diff --git a/CMakeLists.txt b/CMakeLists.txt index 52f44add8..6cb25140f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ project (SuiteSparse DESCRIPTION "A suite of sparse matrix packages" HOMEPAGE_URL http://faculty.cse.tamu.edu/davis/suitesparse.html LANGUAGES C - VERSION 5.12.0 + VERSION 5.13.0 ) set (CMAKE_DEBUG_POSTFIX _debug) diff --git a/ChangeLog b/ChangeLog index e9250456c..a8f1bd7b5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +Aug 25, 2022, SuiteSparse 5.13.0 + + * GraphBLAS v7.2.0: see GraphBLAS/Doc/ChangeLog for details. + * performance: more compact serialization (ZSTD added, now the + default compression method). + * MATLAB interface: faster linear indexing, reshape, bandwidth, + istril, istriu, isbanded, isdiag. C(I,J)=A can now grow the + size of C. + * features: reshape methods, cube root operator, isStoredElement + * bugs: a minor bug; user-defined types were incorrectly limited to + 128 bytes in size in v7.0.3. + Apr 10, 2022, SuiteSparse 5.12.0 * GraphBLAS v7.0.3: see GraphBLAS/Doc/ChangeLog for details. diff --git a/GraphBLAS/.gitignore b/GraphBLAS/.gitignore index 743a85811..21a83414e 100644 --- a/GraphBLAS/.gitignore +++ b/GraphBLAS/.gitignore @@ -59,6 +59,8 @@ alternative/*.out alternative/*_out.m alternative/*_out2.m alternative/*_demo +alternative/*.so* +alternative/*.dylib* Test/*.log Test/errlog.txt diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt index 1a06d2dbc..d8a47ee09 100644 --- a/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/CMakeLists.txt @@ -26,10 +26,10 @@ endif ( ) set ( CMAKE_MACOSX_RPATH TRUE ) # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "Apr 8, 2022" ) +set ( GraphBLAS_DATE "Aug 8, 2022" ) set ( GraphBLAS_VERSION_MAJOR 7 ) -set ( GraphBLAS_VERSION_MINOR 0 ) -set ( GraphBLAS_VERSION_SUB 3 ) +set ( GraphBLAS_VERSION_MINOR 2 ) +set ( GraphBLAS_VERSION_SUB 0 ) message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} ) @@ -58,14 +58,13 @@ message ( STATUS "GraphBLAS C API: v" "${GraphBLAS_API_VERSION_MAJOR}.${GraphBLA # TODO: use something like this: # if ( set some flag to ignore cuda ) -# set ( CMAKE_CUDA off ) +# ... disable CUDA # message ( STATUS "CUDA: disabled" ) # elseif ( ${CMAKE_VERSION} VERSION_LESS "3.17.0" ) # ... 
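The ChangeLog entry above lists ZSTD as the new default compression method for serialization. A minimal sketch of the descriptor-driven GxB serialization API this refers to; the GrB_FP64 matrix, the round-trip wrapper, and the explicit ZSTD request (redundant once it is the default) are illustrative assumptions, not code from this diff:

```c
#include "GraphBLAS.h"
#include <stdlib.h>

// Serialize A to a compressed blob, then reconstruct it as C.
// Assumes GrB_init was called with the standard C malloc/free.
GrB_Info serialize_roundtrip (GrB_Matrix A, GrB_Matrix *C)
{
    GrB_Descriptor desc = NULL ;
    GrB_Descriptor_new (&desc) ;
    // request ZSTD explicitly (v7.2.0 uses it by default anyway)
    GxB_Desc_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_ZSTD) ;

    void *blob = NULL ;
    GrB_Index blob_size = 0 ;
    GrB_Info info = GxB_Matrix_serialize (&blob, &blob_size, A, desc) ;
    if (info == GrB_SUCCESS)
    {
        // rebuild the matrix from the blob (type given for a built-in type)
        info = GxB_Matrix_deserialize (C, GrB_FP64, blob, blob_size, NULL) ;
    }
    free (blob) ;                    // the blob is malloc'd by GraphBLAS
    GrB_Descriptor_free (&desc) ;
    return (info) ;
}
```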
if ( ${CMAKE_VERSION} VERSION_LESS "3.17.0" ) - # CUDA requires cmake 3.17 or later set ( CMAKE_CUDA off ) message ( STATUS "CUDA: not enabled (cmake 3.17.0 or later required)" ) @@ -103,14 +102,14 @@ else ( ) endif ( ) # CUDA is under development for now, and not deployed in production: - set ( CMAKE_CUDA off) + set ( CMAKE_CUDA off ) # Edit these lines for code development only, not for end-users: # set ( CMAKE_BUILD_TYPE Debug ) if ( CMAKE_CUDA ) # for CUDA development only; not for production use - set ( CMAKE_CUDA_DEV on ) + set ( CMAKE_CUDA_DEV off ) project ( graphblas VERSION "${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}" LANGUAGES CUDA C ) @@ -128,25 +127,21 @@ endif ( ) find_package ( OpenMP ) if ( CMAKE_CUDA ) + # with CUDA and RMM + message ( STATUS "CUDA: enabled" ) set ( CMAKE_CUDA_FLAG " -DGBCUDA" ) add_subdirectory ( CUDA ) set ( GB_CUDA graphblascuda ${CUDA_LIBRARIES} ) - link_directories ( "CUDA" ${CUDA_LIBRARIES} ) -else ( ) - set ( CMAKE_CUDA_FLAG " " ) - set ( GB_CUDA ) -endif ( ) - -if ( CMAKE_CUDA ) - message ( STATUS "CUDA: enabled" ) - set ( CMAKE_RMM_FLAG " -DGBRMM" ) - set ( GB_RMM rmm_wrap ${CUDA_LIBRARIES} stdc++ ) + set ( GB_RMM rmm_wrap ${CUDA_LIBRARIES} ) add_subdirectory ( rmm_wrap ) include_directories ( "rmm_wrap" ${CUDA_INCLUDE_DIRS} ) - link_directories ( "CUDA" "${CUDA_LIBRARIES}" "/usr/local/cuda/lib64/stubs" "rmm_wrap" ) + link_directories ( "CUDA" "${CUDA_LIBRARIES}" "/usr/local/cuda/lib64/stubs" "rmm_wrap" "/usr/local/cuda/lib64" ) else ( ) + # without CUDA and RMM message ( STATUS "CUDA: not enabled" ) + set ( CMAKE_CUDA_FLAG " " ) set ( CMAKE_RMM_FLAG " " ) + set ( GB_CUDA ) set ( GB_RMM ) endif ( ) @@ -257,9 +252,9 @@ set ( CMAKE_INCLUDE_CURRENT_DIR ON ) if ( CMAKE_CUDA_DEV ) # for CUDA development only; not for production use - include_directories ( Source/Template Source Include Source/Generated1 lz4 Demo/Include rmm_wrap ) + include_directories ( Source/Template Source Include Source/Generated1 lz4 zstd zstd/zstd_subset Demo/Include rmm_wrap ) else ( ) - include_directories ( Source/Template Source Include Source/Generated1 lz4 Source/Generated2 Demo/Include rmm_wrap ) + include_directories ( Source/Template Source Include Source/Generated1 lz4 zstd zstd/zstd_subset Source/Generated2 Demo/Include rmm_wrap ) endif ( ) #------------------------------------------------------------------------------- @@ -284,7 +279,7 @@ if ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" ) # integer operations wrap set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fwrapv " ) # check all warnings (uncomment for development only) -# set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wpedantic -Werror " ) +# set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wpedantic " ) if ( CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9 ) message ( FATAL_ERROR "gcc version must be at least 4.9" ) endif ( ) @@ -410,17 +405,23 @@ if ( NOT GBNCPUFEAT ) include ( CheckSymbolExists ) check_include_file ( dlfcn.h HAVE_DLFCN_H ) if ( HAVE_DLFCN_H ) + message ( STATUS "cpu_features has dlfcn.h" ) target_compile_definitions ( graphblas PRIVATE HAVE_DLFCN_H ) if ( BUILD_GRB_STATIC_LIBRARY ) target_compile_definitions ( graphblas_static PRIVATE HAVE_DLFCN_H ) endif ( ) + else ( ) + message ( STATUS "cpu_features without dlfcn.h" ) endif ( ) check_symbol_exists ( getauxval "sys/auxv.h" HAVE_STRONG_GETAUXVAL ) if ( HAVE_STRONG_GETAUXVAL ) + message ( STATUS "cpu_features has getauxval from sys/auxv.h" ) target_compile_definitions ( graphblas PRIVATE HAVE_STRONG_GETAUXVAL )
if ( BUILD_GRB_STATIC_LIBRARY ) target_compile_definitions ( graphblas_static PRIVATE HAVE_STRONG_GETAUXVAL ) endif ( ) + else ( ) + message ( STATUS "cpu_features doesn't have getauxval from sys/auxv.h" ) endif ( ) endif ( ) endif ( ) diff --git a/GraphBLAS/CUDA/.gitignore b/GraphBLAS/CUDA/.gitignore index 5db082e1a..2650c12fe 100644 --- a/GraphBLAS/CUDA/.gitignore +++ b/GraphBLAS/CUDA/.gitignore @@ -4,6 +4,7 @@ *.so jitFactory stringify +rmm_log.txt # Do not ignore this file !.gitignore diff --git a/GraphBLAS/CUDA/CMakeLists.txt b/GraphBLAS/CUDA/CMakeLists.txt index 41228aecb..4755bb66f 100644 --- a/GraphBLAS/CUDA/CMakeLists.txt +++ b/GraphBLAS/CUDA/CMakeLists.txt @@ -7,14 +7,14 @@ cmake_minimum_required(VERSION 3.20.1) project(GRAPHBLAS_CUDA VERSION 0.1 LANGUAGES CXX CUDA) -set(CMAKE_CUDA_FLAGS "-cudart=static -lineinfo -G") +set(CMAKE_CUDA_FLAGS "-cudart=static -lineinfo ") set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC ") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGBNCPUFEAT") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGBNCPUFEAT") set(CMAKE_C_STANDARD 11) -message(STATUS "${CMAKE_CXX_FLAGS}") +message(STATUS "C++ flags for CUDA:" "${CMAKE_CXX_FLAGS}") file(GLOB GRAPHBLAS_CUDA_SOURCES "*.cu" "*.c" "*.cpp") @@ -32,21 +32,37 @@ set(GRAPHBLAS_CUDA_INCLUDES ../Include ../CUDA) -message(STATUS "${GRAPHBLAS_CUDA_INCLUDES}") +message(STATUS "GraphBLAS CUDA includes: " "${GRAPHBLAS_CUDA_INCLUDES}") -target_include_directories(graphblascuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} ${GRAPHBLAS_CUDA_INCLUDES}) +set(EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes) + +IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}) + file(MAKE_DIRECTORY ${EXTERNAL_INCLUDES_DIRECTORY}) +endif() + +IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/cuco) + execute_process( + COMMAND git clone "https://github.com/NVIDIA/cuCollections.git" --branch main --recursive cuco + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes) +endif() + +include_directories(${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include) + +target_include_directories(graphblascuda PUBLIC + ${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include + ${CUDAToolkit_INCLUDE_DIRS} + ${GRAPHBLAS_CUDA_INCLUDES}) set_target_properties(graphblascuda PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(graphblascuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(graphblascuda PROPERTIES CUDA_ARCHITECTURES "75") -target_link_libraries(graphblascuda CUDA::nvrtc CUDA::cudart_static) +target_link_libraries(graphblascuda CUDA::nvrtc CUDA::cudart_static CUDA::nvToolsExt ) install ( TARGETS graphblascuda LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - # 1. Execute enumify/stringify/jitify logic to compile ptx kernels and compile/link w/ relevant *.cu files.
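The dlfcn.h and getauxval probes above configure the bundled cpu_features package; HAVE_DLFCN_H and HAVE_STRONG_GETAUXVAL become compile definitions on the graphblas targets so runtime CPU feature detection can use the strongest mechanism available. As a rough, Linux-only illustration of what a "strong getauxval" supplies (a standalone sketch, not GraphBLAS code):

```c
#include <stdio.h>
#include <sys/auxv.h>   // getauxval, AT_HWCAP: glibc 2.16+, Linux only

int main (void)
{
    // the kernel exposes hardware-capability bits via the auxiliary
    // vector; this is cheaper and more reliable than parsing /proc/cpuinfo
    unsigned long hwcap  = getauxval (AT_HWCAP) ;
    unsigned long hwcap2 = getauxval (AT_HWCAP2) ;
    printf ("AT_HWCAP = %#lx, AT_HWCAP2 = %#lx\n", hwcap, hwcap2) ;
    return (0) ;
}
```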
# TODO: Need to do this piece in cmake @@ -58,11 +74,12 @@ set(CUDA_TEST_SUITES ) # -set(CUDA_TEST_MONOIDS PLUS) #MIN MAX TIMES ANY) -set(CUDA_TEST_BINOPS TIMES) #PLUS MIN MAX DIV MINUS RDIV RMINUS FIRST SECOND PAIR) -set(CUDA_TEST_SEMIRINGS PLUS_TIMES) # MIN_PLUS MAX_PLUS) -set(CUDA_TEST_DATATYPES int32_t ) #int64_t uint32_t uint64_t float double) +set(CUDA_TEST_MONOIDS PLUS MIN MAX) # TIMES ANY) +set(CUDA_TEST_BINOPS TIMES PLUS MIN MAX DIV) #MINUS RDIV RMINUS FIRST SECOND PAIR) +set(CUDA_TEST_SEMIRINGS PLUS_TIMES MIN_PLUS MAX_PLUS) +set(CUDA_TEST_DATATYPES int32_t int64_t uint32_t uint64_t float double) set(CUDA_TEST_KERNELS vsvs) # mp vsvs dndn spdn vssp) +set(CUDA_TEST_FORMATS sparse dense sparse_dense reduce) # TODO: Update testGen.py to accept the above CUDA_TEST_* params as arguments @@ -77,23 +94,25 @@ set(CUDA_TEST_CPP_FILES "") foreach(var ${CUDA_TEST_SUITES}) foreach(semiring ${CUDA_TEST_SEMIRINGS}) foreach(kernel ${CUDA_TEST_KERNELS}) - - # TODO: Have Python script also build separate cudaTest.cpp (named something - # like AxB_dot3_cuda_tests.cpp) for each suite. This way we should be able to - # easily ignore them from the build - add_custom_command( - OUTPUT - ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_test_instances.hpp - ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_cuda_tests.cpp - DEPENDS - jitFactory.hpp - COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/test/testGen_cmake.py "\"${CMAKE_CURRENT_SOURCE_DIR}\"" "\"${var}\"" "\"${CUDA_TEST_MONOIDS}\"" - "\"${CUDA_TEST_BINOPS}\"" "\"${semiring}\"" "\"${CUDA_TEST_DATATYPES}\"" - "\"${kernel}\"" - ) - - # Construct final list of files to compile (in parallel) - list(APPEND CUDA_TEST_CPP_FILES ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_cuda_tests.cpp) + foreach(format ${CUDA_TEST_FORMATS}) + + # TODO: Have Python script also build separate cudaTest.cpp (named something + # like AxB_dot3_cuda_tests.cpp) for each suite. This way we should be able to + # easily ignore them from the build + add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_test_instances.hpp + ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp + DEPENDS + jitFactory.hpp + COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/test/testGen_cmake.py "\"${CMAKE_CURRENT_SOURCE_DIR}\"" "\"${var}\"" "\"${CUDA_TEST_MONOIDS}\"" + "\"${CUDA_TEST_BINOPS}\"" "\"${semiring}\"" "\"${CUDA_TEST_DATATYPES}\"" + "\"${kernel}\"" + ) + + # Construct final list of files to compile (in parallel) + list(APPEND CUDA_TEST_CPP_FILES ${CMAKE_CURRENT_BINARY_DIR}/${var}_${semiring}_${format}_cuda_tests.cpp) + endforeach() endforeach() endforeach() endforeach() @@ -127,13 +146,13 @@ endif() # 3. 
Compile/link individual {test_suite_name}_cuda_tests.cpp files into a gtest executable set(GRAPHBLAS_CUDA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/test) -message(STATUS "${CUDA_TEST_CPP_FILES}") +message(STATUS "CUDA test files: " "${CUDA_TEST_CPP_FILES}") add_executable(graphblascuda_test ${CUDA_TEST_CPP_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/test/run_tests.cpp) set_target_properties(graphblascuda_test PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(graphblascuda_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -set_target_properties(graphblascuda_test PROPERTIES CUDA_ARCHITECTURES "70") +set_target_properties(graphblascuda_test PROPERTIES CUDA_ARCHITECTURES "75") include(GoogleTest) diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cpp b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cpp index 376a93408..078e27adc 100644 --- a/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cpp +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cpp @@ -22,6 +22,7 @@ extern "C" #include "jitFactory.hpp" #include "GB_cuda_type_wrap.hpp" +#include "test/GpuTimer.h" template void print_array(void *arr, I size, const char *name) { @@ -36,11 +37,12 @@ void print_array(void *arr, I size, const char *name) { #undef GB_FREE_WORKSPACE #define GB_FREE_WORKSPACE \ { \ - if (Nanobuckets != NULL) rmm_wrap_free (Nanobuckets) ; Nanobuckets = NULL ; \ - if (Blockbucket != NULL) rmm_wrap_free (Blockbucket) ; Blockbucket = NULL ; \ - if (Bucket != NULL) rmm_wrap_free (Bucket); Bucket = NULL ; \ - if (Bucketp != NULL) rmm_wrap_free (Bucketp); Bucketp = NULL ; \ - if (offset != NULL) rmm_wrap_free (offset); offset = NULL ; \ + /* FIXME: use GB_FREE_WORK */ \ + if (Nanobuckets != NULL) rmm_wrap_free (Nanobuckets) ; Nanobuckets = NULL ;\ + if (Blockbucket != NULL) rmm_wrap_free (Blockbucket) ; Blockbucket = NULL ;\ + if (Bucket != NULL) rmm_wrap_free (Bucket); Bucket = NULL ;\ + if (Bucketp != NULL) rmm_wrap_free (Bucketp); Bucketp = NULL ;\ + if (offset != NULL) rmm_wrap_free (offset); offset = NULL ;\ } #undef GB_FREE_ALL @@ -64,12 +66,15 @@ GrB_Info GB_AxB_dot3_cuda // C = A'*B using dot product method ) { + cudaStream_t stream; + CHECK_CUDA_SIMPLE(cudaStreamCreate(&stream)); + + GpuTimer kernel_timer; + //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- - printf ("HERE IN cuda dot3, mask_struct is %d\n", Mask_struct) ; - // when CUDA is enabled, no static headers are used in all of GraphBLAS GrB_Info info ; ASSERT (C != NULL && !(C->static_header)) ; @@ -139,12 +144,14 @@ GrB_Info GB_AxB_dot3_cuda // C = A'*B using dot product method int64_t cnz = mnz ; int64_t cnvec = mnvec ; - int sparsity_M = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ; + int M_sparsity = (M_is_hyper) ?
GxB_HYPERSPARSE : GxB_SPARSE ; + int C_sparsity = M_sparsity ; + bool C_iso = false ; info = GB_new_bix (&C, // sparse or hyper (from M), existing header ctype, cvlen, cvdim, GB_Ap_malloc, true, - sparsity_M, false, M->hyper_switch, cnvec, + M_sparsity, false, M->hyper_switch, cnvec, cnz+1, // add one to cnz for GB_cumsum of Cwork - true, /* not iso: */ false, Context) ; + true, C_iso, Context) ; if (info != GrB_SUCCESS) { @@ -153,183 +160,376 @@ GrB_Info GB_AxB_dot3_cuda // C = A'*B using dot product method return (info) ; } - //int64_t *Citemp = C->i ; - //auto *Cxtemp = C->x ; - //cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) ); - //cudaMalloc ((void**) &(C->x), cnz * C->type->size ); - CHECK_CUDA_SIMPLE(cudaMemAdvise( C->i, (cnz+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device)); - CHECK_CUDA_SIMPLE(cudaMemAdvise( C->x, (cnz+1) * C->type->size , cudaMemAdviseSetPreferredLocation, device)); + CHECK_CUDA_SIMPLE(cudaMemAdvise( C->i, (cnz+1) * sizeof ( int64_t), + cudaMemAdviseSetPreferredLocation, device)); + if (!C_iso) + { + CHECK_CUDA_SIMPLE(cudaMemAdvise( C->x, (cnz+1) * C->type->size , + cudaMemAdviseSetPreferredLocation, device)); + } + + //-------------------------------------------------------------------------- + // Pre-fetch arrays that will be used on the device + //-------------------------------------------------------------------------- + + // prefetch M + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), + device, stream)) ; //stream_data) ; + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), + device, stream )) ; //stream_data) ; + if (!(Mask_struct || M->iso)) + { + // prefetch M->x only if the mask is valued and M is non-iso + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, + device, stream )) ; //stream_data) ; + } + + // prefetch C + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), + device, stream )); //stream_data) ; + if (!C_iso) + { + // FIXME: why prefetch C->x?
+ CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, + device, stream )); //stream_data) ; + } //-------------------------------------------------------------------------- // copy Mp and Mh into C //-------------------------------------------------------------------------- - CHECK_CUDA_SIMPLE(cudaMemcpy (C->p, M->p, (cnvec+1) * sizeof (int64_t), cudaMemcpyDefault)) ; + CHECK_CUDA_SIMPLE( cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t), + cudaMemcpyDefault, stream)) ; + //memcpy( C->p, M->p, (cnvec+1)* sizeof( int64_t) ); if (M_is_hyper) { - // FIXME - CHECK_CUDA_SIMPLE(cudaMemcpy (C->h, M->h, cnvec * sizeof (int64_t), cudaMemcpyDefault)) ; + // FIXME: this method does not yet handle the hypersparse case + CHECK_CUDA_SIMPLE(cudaMemcpyAsync (C->h, M->h, cnvec * sizeof (int64_t), + cudaMemcpyDefault, stream)) ; } C->magic = GB_MAGIC ; C->nvec_nonempty = M->nvec_nonempty ; - C->nvec = M->nvec ; - // the dot3 CUDA kernel will produce C->i with jumbled indices - C->jumbled = true ; + C->jumbled = GB_JUMBLED (M) ; // C is jumbled if M is jumbled GBURBLE ("(GPU C created and copied from M) ") ; + //-------------------------------------------------------------------------- // stringify the semiring and the mask //-------------------------------------------------------------------------- - GB_cuda_semiring_factory mysemiring = GB_cuda_semiring_factory ( ) ; + GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ; - // (1) create the semiring code and name - mysemiring.semiring_factory ( semiring, flipxy, - ctype, M->type, A->type, B->type, Mask_struct, // matrix types - false, GB_sparsity(C), GB_sparsity(M), GB_sparsity(A), GB_sparsity(B) ) ; + // (1) create the mxm code and name + my_mxm_spec.mxm_factory ( C_iso, C_sparsity, ctype, + M, Mask_struct, false, semiring, flipxy, A, B) ; - // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h" + // (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h" jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (mysemiring) ; + filecache.getFile (my_mxm_spec) ; + + GBURBLE ("(GPU stringified srcode = %lu)\n", my_mxm_spec.sr_code) ; + +// cases: + +// (1) +// A full B full +// A bit B full +// A full B bit +// A bit B bit + + if ( GB_IS_FULL(A) && GB_IS_FULL(B) ) + { + + // Full x Full + dense_phase1launchFactory dp1lf(my_mxm_spec); + + GBURBLE ("(GPU phase1 start nblk = %d) ", dp1lf.get_number_of_blocks(M)) ; + kernel_timer.Start(); + dp1lf.jitGridBlockLaunch(C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + kernel_timer.Stop(); + GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ; + + mxm_dense_launchFactory mdlf(my_mxm_spec); + GBURBLE ("(GPU Dense full x full launch ) ") ; + kernel_timer.Start(); + mdlf.jitGridBlockLaunch( C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); // only for timing + kernel_timer.Stop(); + GBURBLE ("(GPU Dense full x full done %12.6g ms, rate=%12.6g)\n", + kernel_timer.Elapsed(), (mnvec)/(1000*kernel_timer.Elapsed())) ; + + } + else if ( GB_IS_FULL(A) && GB_IS_BITMAP(B) ) + { + // FIXME: Full x Bitmap + } + else if ( GB_IS_BITMAP(A) && GB_IS_FULL(B) ) + { + // FIXME: Bitmap x Full + } + else if ( GB_IS_BITMAP(A) && GB_IS_BITMAP(B) ) + { + // FIXME Bitmap x Bitmap + } + else if ( GB_IS_SPARSE(A) && GB_IS_FULL(B) ) + { + + // (2) Sparse x Full + // A sparse B full + // A hyper B full GB_IS_HYPERSPARSE(A) && GB_IS_FULL (B)) + // A sparse B bit + // A hyper B bit + + dense_phase1launchFactory 
dp1lf(my_mxm_spec); + + GBURBLE ("(GPU phase1 start nblk = %d) ", dp1lf.get_number_of_blocks(M)) ; + kernel_timer.Start(); + dp1lf.jitGridBlockLaunch(C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + kernel_timer.Stop(); + GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ; + + mxm_sparse_dense_launchFactory spdnlf(my_mxm_spec); + GBURBLE ("(GPU Dense sparse x full launch ) ") ; + kernel_timer.Start(); + spdnlf.jitGridBlockLaunch( C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); // only for timing + kernel_timer.Stop(); + GBURBLE ("(GPU Dense sparse x full done %12.6g ms, rate=%12.6g)\n", + kernel_timer.Elapsed(), (mnvec)/(1000*kernel_timer.Elapsed())) ; + + } + else if ( GB_IS_HYPERSPARSE(A) && GB_IS_FULL(B) ) + { + // FIXME: Hyper x Full + } + else if ( GB_IS_SPARSE(A) && GB_IS_BITMAP(B) ) + { + // FIXME: Sparse x Bitmap + } + else if ( GB_IS_HYPERSPARSE(A) && GB_IS_BITMAP(B) ) + { + // FIXME: Hyper x Bitmap + } + +// (3) +// A full B sparse +// A bit B sparse +// A full B hyper +// A bit B hyper + +// (4) phase1, phase2, phase2end, phase3: +// A sparse B sparse <<< +// A hyper B sparse +// A sparse B hyper +// A hyper B hyper + + +// && !GB_IS_BITMAP (A) && !GB_IS_BITMAP (B) +// && !GB_IS_FULL (A) && !GB_IS_FULL (B)) + + else if ( GB_IS_SPARSE(A) && GB_IS_SPARSE(B) ) + { + + // Sparse x Sparse - GBURBLE ("(GPU stringified) ") ; //-------------------------------------------------------------------------- // construct the tasks for phase1 and phase2 //-------------------------------------------------------------------------- // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ; // on the GPU: - phase1launchFactory p1lf(mysemiring); + phase1launchFactory p1lf(my_mxm_spec); phase2launchFactory p2lf; phase2endlaunchFactory p2elf; - // # of threads in phase1 and phase2 kernel launches must be the same + // # of threads in phase1 and phase2 kernel launches is related + // # by the size of the warp. ph2_task = ph1_task/32, for example int nthrd = p2lf.get_threads_per_block(); int ntasks = p2elf.get_number_of_blocks(M); int64_t nanobuckets_size = NBUCKETS * nthrd * ntasks; int64_t blockbuckets_size = NBUCKETS * ntasks; - Nanobuckets = (int64_t*)rmm_wrap_malloc(nanobuckets_size * sizeof (int64_t)); - Blockbucket = (int64_t*)rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t)); + // FIXME: use GB_MALLOC_WORK which calls rmm_wrap_malloc anyway + Nanobuckets = (int64_t*) + rmm_wrap_malloc(nanobuckets_size * sizeof (int64_t)); + Blockbucket = (int64_t*) + rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t)); Bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); + Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); + if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL + || Bucket == NULL || offset == NULL) + { + // out of memory + GB_FREE_WORKSPACE ; + return (GrB_OUT_OF_MEMORY) ; + } - CHECK_CUDA_SIMPLE(cudaMemset(Nanobuckets, 0, nanobuckets_size * sizeof(int64_t))); - CHECK_CUDA_SIMPLE(cudaMemset(Blockbucket, 0, blockbuckets_size * sizeof(int64_t))); - CHECK_CUDA_SIMPLE(cudaMemset(Bucketp, 0, (NBUCKETS+1) * sizeof(int64_t))); - CHECK_CUDA_SIMPLE(cudaMemset(Bucket, 0, mnz * sizeof(int64_t))); - CHECK_CUDA_SIMPLE(cudaMemset(offset, 0, NBUCKETS * sizeof(int64_t))); + // fixme: do async with streams + // FIXME: do we need any of these?
+ //CHECK_CUDA_SIMPLE(cudaMemsetAsync(Nanobuckets, 0, + // nanobuckets_size * sizeof(int64_t), stream)); + //CHECK_CUDA_SIMPLE(cudaMemsetAsync(Blockbucket, 0, + // blockbuckets_size * sizeof(int64_t), stream)); + CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucketp, 0, + (NBUCKETS+1) * sizeof(int64_t), stream)); + CHECK_CUDA_SIMPLE(cudaMemsetAsync(offset, 0, + NBUCKETS * sizeof(int64_t), stream)); + //CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucket, 0, + // mnz * sizeof(int64_t), stream)); //-------------------------------------------------------------------------- // phase1 and phase2: place each C(i,j) in a bucket //-------------------------------------------------------------------------- - CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); - CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device)); + CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), + cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), + cudaMemAdviseSetAccessedBy, device)); + + CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), + cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), + cudaMemAdviseSetAccessedBy, device)); + + //-------------------------------------------------------------------------- + // Pre-fetch arrays that will be used on the device + //-------------------------------------------------------------------------- - offset = (int64_t*)rmm_wrap_malloc( (NBUCKETS)*sizeof(int64_t)) ; - CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); - CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device)); + // prefetch M + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), + device, stream)) ; //stream_data) ; + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), + device, stream )) ; //stream_data) ; + if (!(Mask_struct || M->iso)) + { + // prefetch M->x only if the mask is valued and M is non-iso + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, + device, stream )) ; //stream_data) ; + } - memset( offset, 0, NBUCKETS * sizeof(int64_t) ); +// // prefetch C +// CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), +// device, stream )); //stream_data) ; +// if (!C_iso) +// { +// // FIXME: why prefetch C->x?
+// CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, +// device, stream )); //stream_data) ; +// } //-------------------------------------------------------------------------- // Pre-fetch arrays that will be used on the device //-------------------------------------------------------------------------- - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL)) ; //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL )) ; //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL )) ; //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), device, NULL )); //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, device, NULL )); //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL)); // stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL )) ; //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL )) ; //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL)); //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL )); //stream_data) ; - CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL )); //stream_data) ; + // prefetch A + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), + device, stream)); // stream_data) ; + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), + device, stream )) ; //stream_data) ; + if (!A->iso) + { + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, + device, stream )) ; //stream_data) ; + } + + // prefetch B + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), + device, stream)); //stream_data) ; + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), + device, stream )); //stream_data) ; + if (!B->iso) + { + CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->x, bnz * B->type->size, + device, stream )); //stream_data) ; + } // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in // as the pth entry in C. 
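The prefetch hunks above and below replace the NULL default stream with the explicit `stream` created at the top of GB_AxB_dot3_cuda, and skip the values array (M->x, A->x, B->x) whenever a matrix is iso, since an iso matrix stores a single value. A condensed, GraphBLAS-independent sketch of this advise-then-prefetch pattern for unified memory (placeholder names; compile with nvcc):

```c
#include <cuda_runtime.h>
#include <stdint.h>

// Migrate a managed array of n int64_t's to `device` on `stream`.
// Kernels launched on the same stream afterward find the pages resident.
static cudaError_t prefetch_to_device (int64_t *p, size_t n, int device,
    cudaStream_t stream)
{
    // hint the driver to keep the pages on the GPU once migrated
    cudaError_t err = cudaMemAdvise (p, n * sizeof (int64_t),
        cudaMemAdviseSetPreferredLocation, device) ;
    if (err != cudaSuccess) return (err) ;
    // asynchronous migration, ordered with other work on this stream
    return (cudaMemPrefetchAsync (p, n * sizeof (int64_t), device, stream)) ;
}
```

Queuing the prefetch on the stream that later launches the kernels orders the migration before the kernels without a device-wide synchronize.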
- //---------------------------------------------------------------------- + //-------------------------------------------------------------------------- // phase1: assign each C(i,j) to a bucket, and count them - //---------------------------------------------------------------------- - - GBURBLE ("(GPU phase1 start) ") ; - - p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B); + //-------------------------------------------------------------------------- - GBURBLE ("(GPU phase1 done) ") ; + GBURBLE ("(GPU phase1 start nblk = %d) ", p1lf.get_number_of_blocks(M)) ; + kernel_timer.Start(); + p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + kernel_timer.Stop(); - print_array(Nanobuckets, nanobuckets_size, "Nanobuckets"); - print_array(Blockbucket, blockbuckets_size , "Blockbucket"); + GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ; - //---------------------------------------------------------------------- + //-------------------------------------------------------------------------- // phase2: cumsum across the blockbuckets, propagate to thread level - //---------------------------------------------------------------------- + //-------------------------------------------------------------------------- - GBURBLE ("(GPU phase1 start) ") ; + GBURBLE ("(GPU phase2 start nblk=%d ) ", ntasks) ; - p2lf.jitGridBlockLaunch(Blockbucket, offset, M); + kernel_timer.Start(); + p2lf.jitGridBlockLaunch(Blockbucket, offset, M, stream); + kernel_timer.Stop(); - int64_t s= 0; - for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket) + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + + int64_t s= offset[0]; + C->nzombies = s; + bool all_in_one = false; + for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket) { - Bucketp[bucket] = s; + Bucketp[bucket] = s; s+= offset[bucket]; - printf("bucketp[%d] = %ld, offset=%ld\n", bucket, Bucketp[bucket], offset[bucket]); + if ( (Bucketp[bucket] - Bucketp[bucket-1] ) == mnz ) all_in_one = true; } - GBURBLE ("(GPU phase2 done) ") ; - - GBURBLE ("(GPU phase2end start) ") ; + GBURBLE ("(GPU phase2 done %12.6g ms )\n", kernel_timer.Elapsed()) ; - p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket, - Bucketp, Bucket, offset, C, M); + if( !all_in_one) + { + GBURBLE ("(GPU phase2end start nblk=%d) ", ntasks) ; - GBURBLE ("(GPU phase2end done) ") ; + kernel_timer.Start(); + p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket, + Bucketp, Bucket, offset, C, M, stream); - print_array(Bucket, mnz , "Bucket"); - print_array(M->i, mnz , "M->i"); - print_array(C->i, mnz , "C->i"); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + kernel_timer.Stop(); + GBURBLE ("(GPU phase2end done %12.6g ms)\n",kernel_timer.Elapsed()) ; + } - //---------------------------------------------------------------------- + //-------------------------------------------------------------------------- // phase3: do the numerical work - //---------------------------------------------------------------------- + //-------------------------------------------------------------------------- - print_array(Bucketp, NBUCKETS + 1 , "Bucketp"); - C->nzombies = Bucketp[1]; //set pre-zombie counts - printf("pre-kernel C->nzombies=%ld\n", C->nzombies); for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket) { int64_t start = Bucketp[bucket]; - int64_t end = Bucketp[bucket+1]; - + int64_t end = Bucketp[bucket + 1 ]; + //int64_t start = 0; + //int64_t end = cnz; if(end - start > 0) { - printf("Executing bucket: %d with %ld 
edges\n", bucket, end-start); - // TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?) - phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket); - p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B); - } else { - printf("Skipping bucket %d, no work to do\n", bucket); - } - - GBURBLE ("(GPU phase3 done ) ") ; + // TODO: Use stream pool + phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket); + GBURBLE ("(GPU phase3 bucket %d launch ) ", bucket) ; + kernel_timer.Start(); + p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B, stream); + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); // only for timing + kernel_timer.Stop(); + GBURBLE ("(GPU phase3 bucket %d done %12.6g ms, rate=%12.6g)\n", bucket, kernel_timer.Elapsed(), (end-start)/(1000*kernel_timer.Elapsed())) ; } } - C->nzombies += Bucketp[1]; - printf("C->p[0]=%ld\n", C->p[0]); - printf("C->p[1]=%ld\n", C->p[1]); - printf("C->nzombies=%ld\n", C->nzombies); GB_FREE_WORKSPACE ; + } + + CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); + CHECK_CUDA_SIMPLE(cudaStreamDestroy(stream)); return GrB_SUCCESS; } diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp index 6f33f6113..753bcfb5e 100644 --- a/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp @@ -18,9 +18,6 @@ bool GB_AxB_dot3_cuda_branch GB_Context Context ) { - - printf ("HERE IN cuda branch, mask_struct is %d\n", Mask_struct) ; - // very rough estimate of the work to do double adeg = ((double) GB_nnz (A)) / ((double) GB_IMAX (1, A->nvec)) ; double bdeg = ((double) GB_nnz (B)) / ((double) GB_IMAX (1, B->nvec)) ; @@ -41,22 +38,21 @@ bool GB_AxB_dot3_cuda_branch GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ; if (ngpus_to_use > 0 // FIXME: FUTURE: user-defined types and operators -// && (semiring->header_size == 0) // semiring is built-in && (A->type->code != GB_UDT_code) && (B->type->code != GB_UDT_code) - // FIXME: M could be hypersparse. we should handle this + // FIXME: handle M, A, B hypersparse && !GB_IS_HYPERSPARSE (M) - // FIXME: this is easy - && !A->iso && !B->iso - // FIXME: + && !GB_IS_HYPERSPARSE (A) + && !GB_IS_HYPERSPARSE (B) + // FIXME: handle A, B bitmap and/or full && !GB_IS_BITMAP (A) && !GB_IS_BITMAP (B) && !GB_IS_FULL (A) && !GB_IS_FULL (B)) { - printf("Using CUDA Path.\n"); return true; } else { + // FIXME: remove debug outout here: std::cout << "Not using cuda path. M_is_hypersparse: " << GB_IS_HYPERSPARSE(M) << ", A->iso: " << A->iso << ", B->iso: " << B->iso << ", A_BITMAP: " << GB_IS_BITMAP(A) << ", B_BITMAP: " << GB_IS_BITMAP(B) << ", GB_IS_FULL(A): " << GB_IS_FULL(A) diff --git a/GraphBLAS/CUDA/GB_Matrix_allocate.c b/GraphBLAS/CUDA/GB_Matrix_allocate.c deleted file mode 100644 index 6b1bdf609..000000000 --- a/GraphBLAS/CUDA/GB_Matrix_allocate.c +++ /dev/null @@ -1,383 +0,0 @@ -//------------------------------------------------------------------------------ -// GB_Matrix_allocate: allocate space for GrB_Matrix, GrB_Vector, or GrB_Scalar -//------------------------------------------------------------------------------ - -// A mock of the actual methods in ../Source. These are just for testing. - -// FIXME: We should remove this altogether and use GrB_Matrix_allocate - -#include -#include "GB_Matrix_allocate.h" - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// TODO after allocating : -// set A->nvec_nonempty -// set A->nvals for bitmap -// fill A->[p,h,b,i,x] - -// The GrB_Matrix and GrB_Vector objects are different names for the same -// content. A GrB_Vector is held as an m-by-1 non-hypersparse CSC matrix. -// This file is #include'd in GB.h to define the GB_Matrix_opaque, -// GB_Vector_opaque, and GB_Scalar_opaque structs. - -GrB_Scalar GB_Scalar_allocate -( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int sparsity // GxB_FULL, GxB_BITMAP, or GxB_SPARSE -) -{ - assert (sparsity != GxB_HYPERSPARSE) ; - GrB_Scalar s = (GrB_Scalar) GB_Matrix_allocate (type, type_size, - 1, 1, sparsity, true, false, 1, 1) ; - return (s) ; -} - -//------------------------------------------------------------------------------ -// GB_Vector_allocate -//------------------------------------------------------------------------------ - -// For a GrB_Vector object, as an m-by-1 non-hypersparse CSC matrix: -// bool is_csc ; // always true -// int64_t plen ; // always 1, so A->p always has length 2, and -// // contains [0 k] if the vector has k entries; -// // A->p is NULL if the GrB_Vector is bitmap. -// int64_t vdim ; // always 1 -// int64_t nvec ; // always 1 -// int64_t *h ; // always NULL - -GrB_Vector GB_Vector_allocate -( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int64_t length, - int sparsity, // GxB_FULL, GxB_BITMAP, or GxB_SPARSE - bool iso, - int64_t anz // ignored if sparsity is GxB_FULL or GxB_BITMAP -) -{ - assert (sparsity != GxB_HYPERSPARSE) ; - GrB_Vector v = (GrB_Vector) GB_Matrix_allocate (type, type_size, - 1, length, sparsity, true, iso, anz, 1) ; - return (v) ; -} - -//------------------------------------------------------------------------------ -// GB_Matrix_allocate -//------------------------------------------------------------------------------ - -GrB_Matrix GB_Matrix_allocate -( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int64_t nrows, - int64_t ncols, - int sparsity, //GxB_FULL, .. - bool is_csc, - bool iso, - int64_t anz, // ignored if sparsity is GxB_FULL or GxB_BITMAP - int64_t nvec // hypersparse only -) { - -//------------------------------------------------------------------------------ -// basic information: magic, error logger, and type -//------------------------------------------------------------------------------ - -// The first four items exactly match the first four items in the -// GrB_Descriptor struct. 
- - GrB_Matrix A = rmm_wrap_malloc(sizeof(struct GB_Matrix_opaque)); - -// int64_t magic ; // for detecting uninitialized objects - - A->magic = GB_MAGIC; // object is valid - -// size_t header_size ; // size of the malloc'd block for this struct, or 0 - - A->header_size = sizeof(struct GB_Matrix_opaque); // or more - -// char *logger ; // error logger string - - A->logger = NULL; - -// size_t logger_size ; // size of the malloc'd block for logger, or 0 - - A->logger_size = 0; - -// The remaining items are specific the GrB_Matrix, GrB_Vector and GrB_Scalar -// structs, and do not appear in the GrB_Descriptor struct: -// GrB_Type type ; // the type of each numerical entry - - A->type = type; // GrB_FP32 etc - -//------------------------------------------------------------------------------ -// compressed sparse vector data structure -//------------------------------------------------------------------------------ - -// The matrix can be held in one of 8 formats, each one consisting of a set of -// vectors. The vector "names" are in the range 0 to A->vdim-1. Each -// vector has length A->vlen. These two values define the dimension of the -// matrix, where A is m-by-n. The m and n dimenions are vlen and vdim for the -// CSC formats, and reversed for the CSR formats. - -// Ap, Ai, Ax, Ah, and Ab are abbreviations for A->p, A->i, A->x, A->h, and -// A->b, respectively. - -// For the sparse and hypersparse formats, Ap is an integer array of size -// A->plen+1, with Ap [0] always zero. The matrix contains A->nvec sparse -// vectors, where A->nvec <= A->plen <= A->vdim. The arrays Ai and Ax are -// of size A->(WHATERVER) and define the indices and values in each sparse vector. -// The total number of entries in the matrix is Ap [nvec] <= max # entries. -// For the bitmap and full sparsity structures, Ap and Ai are NULL. - -// For both hypersparse and non-hypersparse matrices, if A->nvec_nonempty is -// computed, it is the number of vectors that contain at least one entry, where -// 0 <= A->nvec_nonempty <= A->nvec always holds. If not computed, -// A->nvec_nonempty is equal to -1. 
- -//------------------------------------------------------------------------------ -// The 8 formats: (hypersparse, sparse, bitmap, full) x (CSR or CSC) -//------------------------------------------------------------------------------ - - A->is_csc = is_csc; // true: CSC, false: CSR - - //TODO: This should be enabled in master branch -// A->iso = iso; // true: A->x holds just one entry, false: normal case - - // set the vector dimension and length - if (is_csc) { - A->vlen = nrows; - A->vdim = ncols; - } else { - A->vlen = ncols; - A->vdim = nrows; - } - - if (sparsity == GxB_FULL || sparsity == GxB_BITMAP) { - anz = nrows * ncols; - } - -// create phbix: A->[p,h,b,i,x] - - A->p_size = 0; - A->h_size = 0; - A->b_size = 0; - A->i_size = 0; - A->x_size = 0; - - A->p = NULL; - A->h = NULL; - A->b = NULL; - A->i = NULL; - A->x = NULL; - - // for all matrices: - - if (iso) { - // DIE if cuda_merge_in_progress - // OK for master - A->x_size = type_size; - } else { - A->x_size = anz * type_size; - } - A->x = rmm_wrap_malloc(A->x_size); - - A->nvals = 0; // for bitmapped matrices only - A->nzombies = 0; - A->jumbled = false; - A->Pending = NULL; - A->nvec_nonempty = -1; - A->hyper_switch = 0.0625; - A->bitmap_switch = 0.10; - A->sparsity_control = sparsity; - - switch (sparsity) { - case GxB_FULL: { - - // -------------------------------------- - // Full structure: - // -------------------------------------- - - // Ah, Ap, Ai, and Ab are all NULL. - // A->nvec == A->vdim. A->plen is not needed (set to -1) - - A->plen = -1; - A->nvec = A->vdim; - A->nvec_nonempty = (A->vlen > 0) ? A->vdim : 0; - - // -------------------------------------- - // A->is_csc is true: full CSC format - // -------------------------------------- - - // A is m-by-n: where A->vdim = n, and A->vlen = m - - // Column A(:,j) is held in Ax [p1:p2-1] where p1 = k*m, p2 = (k+1)*m. - // A(i,j) at position p has row index i = p%m and value Ax [p] - - // -------------------------------------- - // A->is_csc is false: full CSR format - // -------------------------------------- - - // A is m-by-n: where A->vdim = m, and A->vlen = n - - // Row A(i,:) is held in Ax [p1:p2-1] where p1 = k*n, p2 = (k+1)*n. - // A(i,j) at position p has column index j = p%n and value Ax [p] - } - break; - - case GxB_BITMAP: { - - // -------------------------------------- - // Bitmap structure: - // -------------------------------------- - - // Ah, Ap, and Ai are NULL. Ab is an int8_t array of size m*n. - // A->nvec == A->vdim. A->plen is not needed (set to -1) - - A->plen = -1; - A->nvec = A->vdim; - A->nvec_nonempty = (A->vlen > 0) ? A->vdim : 0; - A->b_size = anz * sizeof(bool); - A->b = rmm_wrap_malloc(A->b_size); - - // The bitmap structure is identical to the full structure, except for the - // addition of the bitmap array A->b. - - // -------------------------------------- - // A->is_csc is true: bitmap CSC format - // -------------------------------------- - - // A is m-by-n: where A->vdim = n, and A->vlen = m - - // Column A(:,j) is held in Ax [p1:p2-1] where p1 = k*m, p2 = (k+1)*m. - // A(i,j) at position p has row index i = p%m and value Ax [p]. - // The entry A(i,j) is present if Ab [p] == 1, and not present if - // Ab [p] == 0. - - // -------------------------------------- - // A->is_csc is false: bitmap CSR format - // -------------------------------------- - - // A is m-by-n: where A->vdim = m, and A->vlen = n - - // Row A(i,:) is held in Ax [p1:p2-1] where p1 = k*n, p2 = (k+1)*n. 
- // A(i,j) at position p has column index j = p%n and value Ax [p] - // The entry A(i,j) is present if Ab [p] == 1, and not present if - // Ab [p] == 0. - } - break; - - case GxB_SPARSE: { - - // -------------------------------------- - // Sparse structure: - // -------------------------------------- - - // Ah and Ab are NULL - // A->nvec == A->plen == A->vdim - - A->plen = A->vdim; // size of A->p is plen+1 - A->nvec = A->plen; - A->p_size = (A->plen + 1) * sizeof(int64_t); - A->i_size = anz * sizeof(int64_t); - A->p = rmm_wrap_malloc(A->p_size); - A->i = rmm_wrap_malloc(A->i_size); - - // -------------------------------------- - // A->is_csc is true: sparse CSC format - // -------------------------------------- - - // Ap, Ai, and Ax store a sparse matrix in the a very similar style - // as MATLAB and CSparse, as a collection of sparse column vectors. - - // Column A(:,j) is held in two parts: the row indices are in - // Ai [Ap [j]...Ap [j+1]-1], and the numerical values are in the - // same positions in Ax. - - // A is m-by-n: where A->vdim = n, and A->vlen = m - - // -------------------------------------- - // A->is_csc is false: sparse CSR format - // -------------------------------------- - - // Ap, Ai, and Ax store a sparse matrix in CSR format, as a collection - // of sparse row vectors. - - // Row A(i,:) is held in two parts: the column indices are in - // Ai [Ap [i]...Ap [i+1]-1], and the numerical values are in the - // same positions in Ax. - - // A is m-by-n: where A->vdim = m, and A->vlen = n - } - break; - - case GxB_HYPERSPARSE: { - // -------------------------------------- - // Hypersparse structure: - // -------------------------------------- - - // Ab is NULL - // Ah is non-NULL and has size A->plen; it is always kept sorted, - // A->nvec <= A->plen <= A->vdim - - A->plen = nvec; // size of A->p is plen+1 - A->nvec = nvec; - A->p_size = (A->plen + 1) * sizeof(int64_t); - A->h_size = (A->plen) * sizeof(int64_t); - A->i_size = anz * sizeof(int64_t); - A->p = rmm_wrap_malloc(A->p_size); - A->h = rmm_wrap_malloc(A->h_size); - A->i = rmm_wrap_malloc(A->i_size); - - // -------------------------------------- - // A->is_csc is true: hypersparse CSC format - // -------------------------------------- - - // A is held as a set of A->nvec sparse column vectors, but not all - // columns 0 to n-1 are present. - - // If column A(:,j) has any entries, then j = Ah [k] for some - // k in the range 0 to A->nvec-1. - - // Column A(:,j) is held in two parts: the row indices are in Ai [Ap - // [k]...Ap [k+1]-1], and the numerical values are in the same - // positions in Ax. - - // A is m-by-n: where A->vdim = n, and A->vlen = m - - // -------------------------------------- - // A->is_csc is false: hypersparse CSR format - // -------------------------------------- - - // A is held as a set of A->nvec sparse row vectors, but not all - // row 0 to m-1 are present. - - // If row A(i,:) has any entries, then i = Ah [k] for some - // k in the range 0 to A->nvec-1. - - // Row A(i,:) is held in two parts: the column indices are in Ai - // [Ap [k]...Ap [k+1]-1], and the numerical values are in the same - // positions in Ax. 
- - // A is m-by-n: where A->vdim = n, and A->vlen = m - - } - break; - - default:; - } - - A->p_shallow = false ; - A->h_shallow = false ; - A->b_shallow = false ; - A->i_shallow = false ; - A->x_shallow = false ; - A->static_header = false ; // true if this struct is statically allocated - - return (A) ; -} - diff --git a/GraphBLAS/CUDA/GB_Matrix_allocate.h b/GraphBLAS/CUDA/GB_Matrix_allocate.h deleted file mode 100644 index e9c477a74..000000000 --- a/GraphBLAS/CUDA/GB_Matrix_allocate.h +++ /dev/null @@ -1,46 +0,0 @@ - -#ifndef GB_MATRIX_ALLOCATE_H -#define GB_MATRIX_ALLOCATE_H -#include "matrix.h" -#include "pmr_malloc.h" - -#ifdef __cplusplus -extern "C" { -#endif - -GrB_Matrix GB_Matrix_allocate - ( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int64_t nrows, - int64_t ncols, - int sparsity, //GxB_FULL, .. - bool is_csc, - bool iso, - int64_t anz, // ignored if sparsity is GxB_FULL or GxB_BITMAP - int64_t nvec // hypersparse only - ); - -GrB_Vector GB_Vector_allocate - ( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int64_t length, - int sparsity, // GxB_FULL, GxB_BITMAP, or GxB_SPARSE - bool iso, - int64_t anz // ignored if sparsity is GxB_FULL or GxB_BITMAP - ); - -GrB_Scalar GB_Scalar_allocate - ( - GrB_Type type, // NULL on the GPU - size_t type_size, // type->size - int sparsity // GxB_FULL, GxB_BITMAP, or GxB_SPARSE - ); - -#ifdef __cplusplus -} -#endif - -#endif - diff --git a/GraphBLAS/CUDA/GB_Operator.h b/GraphBLAS/CUDA/GB_Operator.h new file mode 120000 index 000000000..c2478dc96 --- /dev/null +++ b/GraphBLAS/CUDA/GB_Operator.h @@ -0,0 +1 @@ +../Source/Template/GB_Operator.h \ No newline at end of file diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h index aae498729..315140097 100644 --- a/GraphBLAS/CUDA/GB_cuda.h +++ b/GraphBLAS/CUDA/GB_cuda.h @@ -14,6 +14,8 @@ extern "C" { +#include +#include #include "GB.h" } @@ -21,16 +23,14 @@ extern "C" #include "cuda_runtime.h" #include "cuda.h" #include "jitify.hpp" -#include "GB_cuda_semiring_factory.hpp" +#include "GB_cuda_mxm_factory.hpp" -#include -#include #include #define CHECK_CUDA_SIMPLE(call) \ do { \ cudaError_t err = call; \ - if (err != cudaSuccess) { \ + if (err != cudaSuccess) { \ const char* str = cudaGetErrorName( err); \ std::cout << "(CUDA runtime) returned " << str; \ std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \ @@ -39,7 +39,6 @@ extern "C" } \ } while (0) - //------------------------------------------------------------------------------ // GB_CUDA_CATCH: catch error from a try { ... } region //------------------------------------------------------------------------------ @@ -68,14 +67,15 @@ extern "C" return (GB_ERROR (info, (GB_LOG, "CUDA died\n"))) ; \ } -// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), -// using 5 different kernels (with different configurations depending on the -// bucket). - #include "GB_cuda_buckets.h" +// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need +// no work...), using different kernels (with different configurations +// depending on the bucket). 
+ +#include "GB_cuda_buckets.h" + extern "C" { #include "GB_stringify.h" - } #endif diff --git a/GraphBLAS/CUDA/GB_cuda_atomics.cuh b/GraphBLAS/CUDA/GB_cuda_atomics.cuh index 86608e1db..75fa19a47 100644 --- a/GraphBLAS/CUDA/GB_cuda_atomics.cuh +++ b/GraphBLAS/CUDA/GB_cuda_atomics.cuh @@ -45,7 +45,9 @@ template __device__ void atomic_sub(T* ptr, T val); template<> __device__ __inline__ void atomic_add(int* ptr, int val) { atomicAdd(ptr, val); } +template<> __device__ __inline__ void atomic_add(uint32_t* ptr, uint32_t val) { atomicAdd((unsigned int*)ptr, (unsigned int)val); } template<> __device__ __inline__ void atomic_add(int64_t* ptr, int64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); } +template<> __device__ __inline__ void atomic_add(uint64_t* ptr, uint64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); } template<> __device__ __inline__ void atomic_add(float* ptr, float val) { atomicAdd(ptr, val); } template<> __device__ __inline__ void atomic_add(double* ptr, double val) { atomicAdd(ptr, val); } diff --git a/GraphBLAS/CUDA/GB_cuda_buckets.h b/GraphBLAS/CUDA/GB_cuda_buckets.h index ba7b2bc25..27ce063c4 100644 --- a/GraphBLAS/CUDA/GB_cuda_buckets.h +++ b/GraphBLAS/CUDA/GB_cuda_buckets.h @@ -1,11 +1,10 @@ -// SPDX-License-Identifier: Apache-2.0 - //------------------------------------------------------------------------------ // GB_cuda_buckets.h: definitions for buckets using for dot3 //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. // http://suitesparse.com See GraphBLAS/Doc/License.txt for license. +// SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ @@ -14,60 +13,41 @@ #ifndef GB_CUDA_BUCKETS_H #define GB_CUDA_BUCKETS_H -#define NBUCKETS 12 -// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), -// using 5 different kernels (with different configurations depending on the -// bucket). -typedef enum -{ - // bring out your dead: - GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) - -// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper -// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x M is (sparse/hyper) - -// a full/full kernel: - // CUDA kernel: dndn, handles a single bucket: - // both A(:,i) and B(:,j) are dense - GB_BUCKET_DNDN = 1, - -// two full/(sparse,hyper) kernels: - // CUDA kernel: spdn, handles 4 buckets: - // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) - GB_BUCKET_DNVS = 2, - // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) - GB_BUCKET_DNSP = 3, +#define NBUCKETS 3 -// a sparse/full kernel - // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense - GB_BUCKET_VSDN = 4, - // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense - GB_BUCKET_SPDN = 5, +// NBUCKETS buckets: computed by up to NBUCKETS-1 kernel launches (zombies need +// no work...), each using different kernels (with different configurations +// depending on the bucket). -// a sparse/bitmap kernel -// a bitmap/bitmap kernel -// a bitmap/sparse kernel -// ... 
- - -// sparse/sparse: - // CUDA kernel: vssp, handles 1 bucket, uses binary search: - // A(:,i) is very sparse compared to B(:,j), or visa versa - GB_BUCKET_VSSP = 6, - - // CUDA kernel: vsvs, handles 4 buckets: - // let len = nnz (A (:,i) + nnz (B (:,j)), then: - GB_BUCKET_VSVS_4 = 7, // len <= 4 - GB_BUCKET_VSVS_16 = 8, // len <= 16 - GB_BUCKET_VSVS_64 = 9, // len <= 64 - GB_BUCKET_VSVS_256 = 10, // len <= 256 - - // CUDA kernel: mp, use the merge-path method: - GB_BUCKET_MERGEPATH = 11, +// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper +// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x (M and C are sparse/hyper) - // CUDA kernel: warpix, use the warp-intersect method, unused so far: - GB_BUCKET_WARP_IX = 12 +typedef enum +{ + GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) + GB_BUCKET_VSVS = 1, // vsvs: both A(:,i) and B(:,j) are very sparse + GB_BUCKET_MERGEPATH = 2, // mp: use the merge-path method } GB_bucket_code ; +// These may use another bucket enum: + + // two full/(sparse,hyper) kernels: + // // CUDA kernel: spdn, handles 4 buckets: + // // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + // GB_BUCKET_DNVS = 2, + // // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + // GB_BUCKET_DNSP = 3, + + // a sparse/full kernel + // // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + // GB_BUCKET_VSDN = 4, + // // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + // GB_BUCKET_SPDN = 5, + + // a sparse/bitmap kernel + // a bitmap/bitmap kernel + // a bitmap/sparse kernel + // ... + #endif diff --git a/GraphBLAS/CUDA/GB_cuda_calloc.cu_dep b/GraphBLAS/CUDA/GB_cuda_calloc.cu_dep deleted file mode 100644 index 4d5d46bf9..000000000 --- a/GraphBLAS/CUDA/GB_cuda_calloc.cu_dep +++ /dev/null @@ -1,31 +0,0 @@ -//------------------------------------------------------------------------------ -// GB_cuda_calloc.cu: wrapper for cudaMallocManaged and memset -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. -// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. - -//------------------------------------------------------------------------------ - -#include "GB_cuda.h" - -void *GB_cuda_calloc (size_t n, size_t size) // standcard calloc signature -{ - - // malloc the space - void *p = GB_cuda_malloc (n * size) ; - - if (p == NULL) - { - // out of memory, or other CUDA error - return (NULL) ; - } - - // set the space to zero - memset (p, 0, n * size) ; - - // return the result - return (p) ; -} - diff --git a/GraphBLAS/CUDA/GB_cuda_free.cu_dep b/GraphBLAS/CUDA/GB_cuda_free.cu_dep deleted file mode 100644 index ac1128541..000000000 --- a/GraphBLAS/CUDA/GB_cuda_free.cu_dep +++ /dev/null @@ -1,19 +0,0 @@ -//------------------------------------------------------------------------------ -// GB_cuda_free.cu: wrapper for cudaFree -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. -// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
- -//------------------------------------------------------------------------------ - -#include "GB_cuda.h" -#include "rmm/detail/cnmem.hpp" - -void GB_cuda_free (void *p) // standard free signature -{ - cnmemFree( p , NULL); - //printf(" GPU %d freeing mem\n", device); -} - diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu index b535298d9..a1fe58a46 100644 --- a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu +++ b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu @@ -10,88 +10,97 @@ #include "GB_cuda.h" -bool GB_cuda_get_device ( int &device){ - bool goodreturn = false; +//------------------------------------------------------------------------------ +// GB_cuda_get_device: get the current GPU +//------------------------------------------------------------------------------ + +bool GB_cuda_get_device (int &device) +{ if (&device == NULL) { // invalid inputs return (false) ; } - - CHECK_CUDA_SIMPLE ( cudaGetDevice( &device ) ); - goodreturn = true; - - return goodreturn; - + CHECK_CUDA_SIMPLE (cudaGetDevice (&device)) ; + return (true) ; } -bool GB_cuda_set_device( int device) { - bool goodreturn = false; +//------------------------------------------------------------------------------ +// GB_cuda_set_device: set the current GPU +//------------------------------------------------------------------------------ + +bool GB_cuda_set_device (int device) +{ if (device < 0) { // invalid inputs return (false) ; } - - CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); - goodreturn = true; - - return goodreturn; + CHECK_CUDA_SIMPLE (cudaSetDevice (device)) ; + return (true) ; } +//------------------------------------------------------------------------------ +// GB_cuda_get_device_properties: determine all properties of a single GPU +//------------------------------------------------------------------------------ + bool GB_cuda_get_device_properties // true if OK, false if failure ( int device, - rmm_device *prop + GB_cuda_device *prop ) { //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- - bool goodreturn = false; + if (prop == NULL || device < 0) { // invalid inputs return (false) ; } - int old_device; - CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ; + // clear the GPU settings + memset (prop, 0, sizeof (GB_cuda_device)) ; + int old_device ; + CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ; //-------------------------------------------------------------------------- // get the properties //-------------------------------------------------------------------------- - int num_sms; - int compute_capability_major; - int compute_capability_minor; - size_t memfree, memtotal; + + int num_sms, compute_capability_major, compute_capability_minor ; + size_t memfree, memtotal ; CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&num_sms, cudaDevAttrMultiProcessorCount, - device) ); + device) ) ; CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_major, cudaDevAttrComputeCapabilityMajor, - device) ); + device) ) ; CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_minor, cudaDevAttrComputeCapabilityMajor, - device) ); + device) ) ; - CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); + CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ) ; CHECK_CUDA_SIMPLE ( cudaMemGetInfo( & memfree, &memtotal) ) ; - CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ); - - prop->total_global_memory = memtotal; - prop->number_of_sms = 
num_sms; - prop->compute_capability_major = compute_capability_major; - prop->compute_capability_minor = compute_capability_minor; - - goodreturn = true; + CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ) ; + + prop->total_global_memory = memtotal ; + prop->number_of_sms = num_sms ; + prop->compute_capability_major = compute_capability_major ; + prop->compute_capability_minor = compute_capability_minor ; + + printf ("Device: %d: memory: %ld SMs: %d compute: %d.%d\n", + device, prop->total_global_memory, prop->number_of_sms, + prop->compute_capability_major, prop->compute_capability_minor) ; + //-------------------------------------------------------------------------- // return result //-------------------------------------------------------------------------- - return goodreturn; + return (true) ; } diff --git a/GraphBLAS/CUDA/GB_cuda_init.c b/GraphBLAS/CUDA/GB_cuda_init.c new file mode 100644 index 000000000..bcc40eb57 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_init.c @@ -0,0 +1,39 @@ +//------------------------------------------------------------------------------ +// GB_cuda_init: initialize the GPUs for use by GraphBLAS +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// GB_cuda_init queries the system for the # of GPUs available, their memory +// sizes, SM counts, and other capabilities. Unified Memory support is +// assumed. Then each GPU is "warmed up" by allocating a small amount of +// memory. + +#include "GB.h" + +GrB_Info GB_cuda_init (void) +{ + GB_Global_gpu_control_set (GxB_DEFAULT) ; + if (!GB_Global_gpu_count_set (true)) return (GrB_PANIC) ; + int gpu_count = GB_Global_gpu_count_get ( ) ; + for (int device = 0 ; device < 1 ; device++) // TODO for GPU: gpu_count + { + // query the GPU and then warm it up + if (!GB_Global_gpu_device_properties_get (device)) + { + return (GrB_PANIC) ; + } + if (!GB_cuda_warmup (device)) + { + return (GrB_PANIC) ; + } + } + // make GPU 0 the default device + GB_cuda_set_device( 0 ); + // also check for jit cache, pre-load library of common kernels ... + return (GrB_SUCCESS) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_kernel.h b/GraphBLAS/CUDA/GB_cuda_kernel.h new file mode 100644 index 000000000..af2a4e760 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_kernel.h @@ -0,0 +1,361 @@ +//------------------------------------------------------------------------------ +// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This file is #include'd into all CUDA kernels for GraphBLAS. 
It provides +// a subset of GraphBLAS.h and GB.h. + +#pragma once +#undef ASSERT +#define ASSERT(x) + +//------------------------------------------------------------------------------ +// TODO: this will be in the jit code: +#define chunksize 128 + +//------------------------------------------------------------------------------ +// GETA, GETB: get entries from input matrices A and B +//------------------------------------------------------------------------------ + +#if GB_FLIPXY + + #if GB_A_IS_PATTERN + #define GB_DECLAREA(aval) + #define GB_SHAREDA(aval) + #define GB_GETA( aval, ax, p) + #else + #define GB_DECLAREA(aval) T_Y aval + #define GB_SHAREDA(aval) __shared__ T_Y aval + #if GB_A_ISO + #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ; + #else + #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ; + #endif + #endif + + #if GB_B_IS_PATTERN + #define GB_DECLAREB(bval) + #define GB_SHAREDB(bval) + #define GB_GETB( bval, bx, p) + #else + #define GB_DECLAREB(bval) T_X bval + #define GB_SHAREDB(bval) __shared__ T_X bval + #if GB_B_ISO + #define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ; + #else + #define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ; + #endif + #endif + +#else + + #if GB_A_IS_PATTERN + #define GB_DECLAREA(aval) + #define GB_SHAREDA(aval) + #define GB_GETA( aval, ax, p) + #else + #define GB_DECLAREA(aval) T_X aval + #define GB_SHAREDA(aval) __shared__ T_X aval + #if GB_A_ISO + #define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ; + #else + #define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ; + #endif + #endif + + #if GB_B_IS_PATTERN + #define GB_DECLAREB(bval) + #define GB_SHAREDB(bval) + #define GB_GETB( bval, bx, p) + #else + #define GB_DECLAREB(bval) T_Y bval + #define GB_SHAREDB(bval) __shared__ T_Y bval + #if GB_B_ISO + #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ; + #else + #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ; + #endif + #endif + +#endif + +//------------------------------------------------------------------------------ +// operators +//------------------------------------------------------------------------------ + +#if GB_C_ISO + + #define GB_ADD_F( f , s) + #define GB_C_MULT( c, a, b) + #define GB_MULTADD( c, a ,b ) + #define GB_DOT_TERMINAL ( c ) + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + +#else + + #define GB_ADD_F( f , s) f = GB_ADD ( f, s ) + #define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) ) + #define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) ) + #define GB_DOT_TERMINAL ( c ) + //# if ( c == TERMINAL_VALUE) break; + + #if GB_IS_PLUS_PAIR_REAL_SEMIRING + + // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring) + #if GB_ZTYPE_IGNORE_OVERFLOW + // plus_pair for int64, uint64, float, or double + #define GB_DOT_MERGE(pA,pB) cij++ ; + #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ; + #else + // plus_pair semiring for small integers + #define GB_DOT_MERGE(pA,pB) \ + { \ + cij_exists = true ; \ + cij++ ; \ + } + #define GB_CIJ_EXIST_POSTCHECK + #endif + + #else + + // cij += A(k,i) * B(k,j), for merge operation (general case) + #define GB_DOT_MERGE(pA,pB) \ + { \ + GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ \ + GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ \ + cij_exists = true ; \ + GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \ + } + #define GB_CIJ_EXIST_POSTCHECK + + #endif + +#endif + +//------------------------------------------------------------------------------ +// subset of GraphBLAS.h 
+//------------------------------------------------------------------------------ + +#ifndef GRAPHBLAS_H +#define GRAPHBLAS_H + +#undef restrict +#undef GB_restrict +#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ ) + #define GB_restrict __restrict__ +#else + #define GB_restrict +#endif +#define restrict GB_restrict + +#include +//#include +#include +#include + +// GB_STR: convert the content of x into a string "x" +#define GB_XSTR(x) GB_STR(x) +#define GB_STR(x) #x + +#undef GB_PUBLIC +#define GB_PUBLIC extern +#undef GxB_MAX_NAME_LEN +#define GxB_MAX_NAME_LEN 128 + +typedef uint64_t GrB_Index ; +typedef struct GB_Descriptor_opaque *GrB_Descriptor ; +typedef struct GB_Type_opaque *GrB_Type ; +typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ; +typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ; +typedef struct GB_SelectOp_opaque *GxB_SelectOp ; +typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ; +typedef struct GB_Monoid_opaque *GrB_Monoid ; +typedef struct GB_Semiring_opaque *GrB_Semiring ; +typedef struct GB_Scalar_opaque *GrB_Scalar ; +typedef struct GB_Vector_opaque *GrB_Vector ; +typedef struct GB_Matrix_opaque *GrB_Matrix ; + +#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form +#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) +#define GxB_BITMAP 4 // store matrix as a bitmap +#define GxB_FULL 8 // store matrix as full; all entries must be present + +typedef void (*GxB_unary_function) (void *, const void *) ; +typedef void (*GxB_binary_function) (void *, const void *, const void *) ; + +typedef bool (*GxB_select_function) // return true if A(i,j) is kept +( + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j) + const void *x, // value of A(i,j) + const void *thunk // optional input for select function +) ; + +typedef void (*GxB_index_unary_function) +( + void *z, // output value z, of type ztype + const void *x, // input value x of type xtype; value of v(i) or A(i,j) + GrB_Index i, // row index of A(i,j) + GrB_Index j, // column index of A(i,j), or zero for v(i) + const void *y // input scalar y +) ; + +typedef enum +{ + // for all GrB_Descriptor fields: + GxB_DEFAULT = 0, // default behavior of the method + + // for GrB_OUTP only: + GrB_REPLACE = 1, // clear the output before assigning new values to it + + // for GrB_MASK only: + GrB_COMP = 2, // use the structural complement of the input + GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) + GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values + + // for GrB_INP0 and GrB_INP1 only: + GrB_TRAN = 3, // use the transpose of the input + + // for GxB_GPU_CONTROL only (DRAFT: in progress, do not use) + GxB_GPU_ALWAYS = 2001, + GxB_GPU_NEVER = 2002, + + // for GxB_AxB_METHOD only: + GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method + GxB_AxB_DOT = 1003, // dot product + GxB_AxB_HASH = 1004, // hash-based saxpy method + GxB_AxB_SAXPY = 1005 // saxpy method (any kind) +} +GrB_Desc_Value ; + +#include "GB_opaque.h" +#endif + +//------------------------------------------------------------------------------ +// subset of GB.h +//------------------------------------------------------------------------------ +//#include GB_iceil.h +#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) +//#include GB_imin.h +#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y)) +#define GB_IMIN(x,y) (((x) < (y)) ? 
(x) : (y)) +//#include GB_zombie.h +#define GB_FLIP(i) (-(i)-2) +#define GB_IS_FLIPPED(i) ((i) < 0) +#define GB_IS_ZOMBIE(i) ((i) < 0) +#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) +#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) +#define GBI_UNFLIP(Ai,p,avlen) \ + ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p])) + +#include "GB_nnz.h" +#include "GB_partition.h" +//#include "GB_binary_search.h" +// version for the GPU, with fewer branches +#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \ +{ \ + /* binary search of X [pleft ... pright] for integer i */ \ + while (pleft < pright) \ + { \ + int64_t pmiddle = (pleft + pright) >> 1 ; \ + bool less = (X [pmiddle] < i) ; \ + pleft = less ? (pmiddle+1) : pleft ; \ + pright = less ? pright : pmiddle ; \ + } \ + /* binary search is narrowed down to a single item */ \ + /* or it has found the list is empty */ \ + ASSERT (pleft == pright || pleft == pright + 1) ; \ +} +#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \ +{ \ + GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \ + found = (pleft == pright && X [pleft] == i) ; \ +} +#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \ +{ \ + GB_BINARY_SEARCH (i, X, pleft, pright, found) \ + if (!found && (pleft == pright)) \ + { \ + if (i > X [pleft]) \ + { \ + pleft++ ; \ + } \ + else \ + { \ + pright++ ; \ + } \ + } \ +} + +//#include "GB_search_for_vector_template.c" +__device__ +static inline int64_t GB_search_for_vector_device +( + const int64_t p, // search for vector k that contains p + const int64_t *restrict Ap, // vector pointers to search + int64_t kleft, // left-most k to search + int64_t anvec, // Ap is of size anvec+1 + int64_t avlen // A->vlen +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (Ap == NULL) + { + // A is full or bitmap + ASSERT (p >= 0 && p < avlen * anvec) ; + return ((avlen == 0) ? 0 : (p / avlen)) ; + } + + // A is sparse + ASSERT (p >= 0 && p < Ap [anvec]) ; + + //-------------------------------------------------------------------------- + // search for k + //-------------------------------------------------------------------------- + + int64_t k = kleft ; + int64_t kright = anvec ; + bool found ; + GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ; + if (found) + { + // Ap [k] == p has been found, but if k is an empty vector, then the + // next vector will also contain the entry p. In that case, k needs to + // be incremented until finding the first non-empty vector for which + // Ap [k] == p. + ASSERT (Ap [k] == p) ; + while (k < anvec-1 && Ap [k+1] == p) + { + k++ ; + } + } + else + { + // p has not been found in Ap, so it appears in the middle of Ap [k-1] + // ... Ap [k], as computed by the binary search. This is the range of + // entries for the vector k-1, so k must be decremented. + k-- ; + } + + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + // The entry p must reside in a non-empty vector. 
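    // An illustrative trace (editorial example, not library code): with
    // Ap = { 0, 0, 2, 5 } and anvec = 3, vector 0 is empty.  For p = 0, the
    // binary search finds Ap [0] == 0 and the while loop above advances k
    // to 1, the first non-empty vector whose range contains entry 0.  For
    // p = 1, found is false and the search ends at k = 2, so k is
    // decremented to 1, and indeed Ap [1] = 0 <= 1 < Ap [2] = 2: entry 1
    // lies in vector 1.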
+ ASSERT (k >= 0 && k < anvec) ; + ASSERT (Ap [k] <= p && p < Ap [k+1]) ; + + return (k) ; +} diff --git a/GraphBLAS/CUDA/GB_cuda_malloc.cu_dep b/GraphBLAS/CUDA/GB_cuda_malloc.cu_dep deleted file mode 100644 index 64c592896..000000000 --- a/GraphBLAS/CUDA/GB_cuda_malloc.cu_dep +++ /dev/null @@ -1,24 +0,0 @@ -//------------------------------------------------------------------------------ -// GB_cuda_malloc.cu: wrapper for cuda Managed Memory allocator, or pool -//------------------------------------------------------------------------------ - -// SPDX-License-Identifier: Apache-2.0 -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. -// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. - -//------------------------------------------------------------------------------ - -#include "GB_cuda.h" -#include "rmm/detail/cnmem.h" - -void *GB_cuda_malloc (size_t size) // standard malloc signature -{ - void *p = NULL ; - - cnmemMalloc( &p, size, NULL); - - return p; - - -} - diff --git a/GraphBLAS/CUDA/GB_cuda_semiring_factory.hpp b/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp similarity index 50% rename from GraphBLAS/CUDA/GB_cuda_semiring_factory.hpp rename to GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp index 2bf520281..8d1516a04 100644 --- a/GraphBLAS/CUDA/GB_cuda_semiring_factory.hpp +++ b/GraphBLAS/CUDA/GB_cuda_mxm_factory.hpp @@ -1,4 +1,4 @@ -// Class to manage both stringify functions from semiring, ops and monoids to char buffers +// Class to manage both stringify functions from mxm, ops and monoids to char buffers // Also provides an iostream callback to deliver the buffer to jitify as if read from a file // (c) Nvidia Corp. 2020 All rights reserved @@ -21,20 +21,21 @@ extern "C" // Define function pointer we will use later //std::istream* (*file_callback)(std::string, std::iostream&); -// Define a factory class for building any semiring text definitions -class GB_cuda_semiring_factory: public jit::File_Desc { +// Define a factory class for building any mxm text definitions - public: +// FIXME: delegate problem generation to data factory +class GB_cuda_mxm_factory: public jit::File_Desc { + public: uint64_t sr_code; - bool mask_struct; + GrB_Semiring semiring ; // file ptr FILE *fp; void open( const char *path_and_file, const char *mode) { - std::cout<< "opening "<< path_and_file<<" for write"<< std::endl; +// std::cout<< "opening "<< path_and_file<<" for write"<< std::endl; fp = fopen( path_and_file, mode); } @@ -44,64 +45,72 @@ class GB_cuda_semiring_factory: public jit::File_Desc { } //-------------------------------------------------------------------------- - //semiring_factor takes a set of inputs describing and operation (semiring, - //mask, datatypes, sparsity formats) and produces a numerical unique value - //for those This allows rapid lookups to see if we have handled this case - //before, and avoids the need to generate and manage strings at this stage. + // mxm_factory takes a set of inputs describing an operation (semiring, + // mask, datatypes, sparsity formats, etc) and produces a numerical unique + // value for those. This allows rapid lookups to see if we have handled this + // case before, and avoids the need to generate and manage strings at this + // stage. 
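// As an editorial illustration of that rapid lookup (hypothetical code, not
// part of this commit), the scode computed below could key a small cache
// that records which problems already have generated kernel headers:
//
//      #include <cstdint>
//      #include <map>
//      #include <string>
//
//      // hypothetical cache: scode -> filename of the generated header
//      static std::map<uint64_t, std::string> handled_problems ;
//
//      static bool already_handled (uint64_t scode)
//      {
//          return (handled_problems.count (scode) > 0) ;
//      }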
//-------------------------------------------------------------------------- - void semiring_factory + // FIXME: pass in user's C_in matrix, in case C_in+=A*B can be done + // in-place + // FIXME: handle hypersparse case in dot3 + + void mxm_factory ( - // input: - GrB_Semiring semiring, // the semiring to enumify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + // C matrix: + bool C_iso, // true if C is iso-valued + int C_sparsity, // sparsity structure of C GrB_Type ctype, // the type of C - GrB_Type mtype, // the type of M, or NULL if no mask - GrB_Type atype, // the type of A - GrB_Type btype, // the type of B + // M matrix: + GrB_Matrix M, // may be NULL bool Mask_struct, // mask is structural bool Mask_comp, // mask is complemented - int C_sparsity, // sparsity structure of C - int M_sparsity, // sparsity structure of M - int A_sparsity, // sparsity structure of A - int B_sparsity // sparsity structure of B + // semiring: + GrB_Semiring semiring, // the semiring to enumify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + // A and B: + GrB_Matrix A, + GrB_Matrix B ) { - std::cout<<" calling stringify semiring: " << std::endl; - GxB_Semiring_fprint (semiring, "stringfiy the smiering", GxB_COMPLETE, stdout) ; - std::cout<<" Mask_struct: " << Mask_struct << std::endl; +// std::cout<<" calling stringify semiring: " << std::endl; +// GxB_Semiring_fprint (semiring, "stringfiy the smiering", GxB_COMPLETE, stdout) ; +// std::cout<<" Mask_struct: " << Mask_struct << std::endl; uint64_t scode; - GB_enumify_semiring ( + GB_enumify_mxm ( // output: &scode, // unique encoding of the entire semiring // input: + C_iso, // true if C is iso-valued + C_sparsity, // sparsity structure of C + ctype, // the type of C + // M matrix: + M, + Mask_struct, // mask is structural + Mask_comp, // mask is complemented + // semiring: semiring, // the semiring to enumify flipxy, // multiplier is: mult(a,b) or mult(b,a) - ctype, // the type of C - mtype, // the type of M, or NULL if no mask - atype, // the type of A - btype, // the type of B - Mask_struct, // mask is structural - Mask_comp, // mask is complemented - C_sparsity, // sparsity structure of C - M_sparsity, // sparsity structure of M - A_sparsity, // sparsity structure of A - B_sparsity // sparsity structure of B + // A and B: + A, + B ) ; - printf("scode=%lu\n", scode); - std::cout << "done stringify semiring" << std::endl; +// printf("scode=%lu\n", scode); +// std::cout << "done stringify mxm" << std::endl; this->sr_code = scode; - mask_struct = Mask_struct; + this->semiring = semiring ; + std::stringstream ss; - ss << "GB_semiring_" << this->sr_code << ".h"; + ss << "GB_mxm_" << this->sr_code << ".h"; std::string new_filename = ss.str(); filename.resize(new_filename.size()); strcpy(filename.data(), new_filename.data()); - std::cout<<" returned from stringify semiring"<< std::endl; +// std::cout<<" returned from stringify mxm"<< std::endl; } //------------------------------------------------------------------------------ @@ -111,17 +120,17 @@ class GB_cuda_semiring_factory: public jit::File_Desc { void macrofy ( ) override { - std::cout<<" calling macrofy semiring. sr_code="<< this->sr_code << std::endl; - GB_macrofy_semiring ( +// std::cout<<" calling macrofy mxm. 
sr_code="<< this->sr_code << std::endl; + GB_macrofy_mxm ( // output to file : fp, // input: this->sr_code ) ; - std::cout<<" returned from macrofy semiring"<< std::endl; +// std::cout<<" returned from macrofy mxm"<< std::endl; } -}; // GB_cuda_semiring_factory +}; // GB_cuda_mxm_factory diff --git a/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp b/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp new file mode 100644 index 000000000..49459fc9e --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_reduce_factory.hpp @@ -0,0 +1,79 @@ +// Class to manage both stringify functions from reduce, ops and monoids to char buffers +// Also provides an iostream callback to deliver the buffer to jitify as if read from a file + +// (c) Nvidia Corp. 2020 All rights reserved +// SPDX-License-Identifier: Apache-2.0 + +// Implementations of string callbacks +#pragma once + +#include <iostream> +#include <sstream> +#include "GB_jit_cache.h" + +extern "C" +{ +#include "GB.h" +#include "GB_stringify.h" +} + +// FIXME: delegate problem generation to data factory +class GB_cuda_reduce_factory: public jit::File_Desc { + +public: + uint64_t rcode; + GrB_Monoid reduce ; + + // file ptr + FILE *fp; + + void open( const char *path_and_file, const char *mode) + { + fp = fopen( path_and_file, mode); + } + + void close( ) + { + fclose( fp ); + } + + + void reduce_factory(GrB_Monoid reduce, GrB_Matrix A) + { + uint64_t rcode; + GB_enumify_reduce ( + // output: + &rcode, // unique encoding of entire monoid + // input: + reduce, + A + ) ; + + this->rcode = rcode; + this->reduce = reduce ; + std::stringstream ss; + ss << "GB_reduce_" << this->rcode << ".h"; + + std::string new_filename = ss.str(); + filename.resize(new_filename.size()); + strcpy(filename.data(), new_filename.data()); + } + +//------------------------------------------------------------------------------ +// Macrofy takes a code and creates the corresponding string macros for +// operators, datatypes, sparsity formats and produces a character buffer. 
+//------------------------------------------------------------------------------ + + void macrofy ( ) override + { + GB_macrofy_reduce ( + // output to file : + fp, + // input: + this->rcode + ) ; + } + + +}; // GB_cuda_reduce_factory + diff --git a/GraphBLAS/CUDA/GB_cuda_type_wrap.hpp b/GraphBLAS/CUDA/GB_cuda_type_wrap.hpp index b571dceb5..3745fe399 100644 --- a/GraphBLAS/CUDA/GB_cuda_type_wrap.hpp +++ b/GraphBLAS/CUDA/GB_cuda_type_wrap.hpp @@ -113,6 +113,38 @@ template<> inline GrB_Info vector_reduce(float *scalar, GrB_Vector A, GrB template<> inline GrB_Info vector_reduce(double *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_FP64(scalar, NULL, op, A, NULL); } template<> inline GrB_Info vector_reduce(bool *scalar, GrB_Vector A, GrB_Monoid op) { return GrB_Vector_reduce_BOOL(scalar, NULL, op, A, NULL); } +/** + * GxB_Matrix_reduce_FC32 // c = accum (c, reduce_to_scalar (A)) + ( + GxB_FC32_t *c, // result scalar + const GrB_BinaryOp accum, // optional accum for c=accum(c,t) + const GrB_Monoid monoid, // monoid to do the reduction + const GrB_Matrix A, // matrix to reduce + const GrB_Descriptor desc + + * @tparam T + * @param scalar + * @param A + * @param op + * @return + */ + +template +GrB_Info matrix_reduce(T *scalar, GrB_Matrix A, GrB_Monoid op); + +template<> inline GrB_Info matrix_reduce(int8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT8(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(int16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT16(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(int32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT32(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(int64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_INT64(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(uint8_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT8(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(uint16_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT16(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(uint32_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT32(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(uint64_t *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_UINT64(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(float *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP32(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(double *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_FP64(scalar, NULL, op, A, NULL); } +template<> inline GrB_Info matrix_reduce(bool *scalar, GrB_Matrix A, GrB_Monoid op) { return GrB_Matrix_reduce_BOOL(scalar, NULL, op, A, NULL); } + + template GrB_Info get_element(GrB_Matrix A, T* x, int64_t i, int64_t j); template<> inline GrB_Info get_element(GrB_Matrix A, int8_t *x, int64_t i, int64_t j) { return GrB_Matrix_extractElement_INT8(x, A, i, j); } diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/GB_cuda_warmup.cu index 22aae719f..9e5e12f3a 100644 --- a/GraphBLAS/CUDA/GB_cuda_warmup.cu +++ b/GraphBLAS/CUDA/GB_cuda_warmup.cu @@ -23,11 +23,6 @@ bool GB_cuda_warmup (int device) double gpu_memory_size = GB_Global_gpu_memorysize_get (device); - printf ("warming up device %d memsize %g sms %d\n", - device, - 
gpu_memory_size, - GB_Global_gpu_sm_get (device)) ; - size_t size = 0 ; void *p = GB_malloc_memory (1, 1, &size) ; if (p == NULL) @@ -35,11 +30,11 @@ bool GB_cuda_warmup (int device) printf ("Hey!! where's da memory???\n") ; return (false) ; } - printf ("oooo nice block of memory of size %lu\n", size) ; +// printf ("oooo nice block of memory of size %lu\n", size) ; GB_free_memory ( &p, size) ; - printf ("be free, block of memory of size %lu\n", size) ; +// printf ("be free, block of memory of size %lu\n", size) ; - printf ("good ol' cudaMalloc just to be sure\n"); +// printf ("good ol' cudaMalloc just to be sure\n"); cudaMalloc ( &p, size ) ; if (p == NULL) { @@ -48,7 +43,7 @@ bool GB_cuda_warmup (int device) } cudaFree (p) ; - printf ("GPU %d nice and toasty now\n", device) ; +// printf ("GPU %d nice and toasty now\n", device) ; // TODO check for jit cache? or in GB_init? diff --git a/GraphBLAS/CUDA/GB_index.h b/GraphBLAS/CUDA/GB_index.h new file mode 120000 index 000000000..896c5ab89 --- /dev/null +++ b/GraphBLAS/CUDA/GB_index.h @@ -0,0 +1 @@ +../Source/GB_index.h \ No newline at end of file diff --git a/GraphBLAS/CUDA/GB_int64_multiply.c b/GraphBLAS/CUDA/GB_int64_multiply.c new file mode 120000 index 000000000..3ab424008 --- /dev/null +++ b/GraphBLAS/CUDA/GB_int64_multiply.c @@ -0,0 +1 @@ +../Source/GB_int64_multiply.c \ No newline at end of file diff --git a/GraphBLAS/CUDA/GB_jit_cache.cu b/GraphBLAS/CUDA/GB_jit_cache.cu index a42488b69..99414c69a 100644 --- a/GraphBLAS/CUDA/GB_jit_cache.cu +++ b/GraphBLAS/CUDA/GB_jit_cache.cu @@ -86,7 +86,7 @@ std::string getCacheDir() { struct stat st; if ( (stat( kernel_cache_path.c_str(), &st) != 0) ) { // `mkdir -p` the kernel cache path if it doesn't exist - printf("cache is going to path %s\n", kernel_cache_path.c_str()); +// printf("cache is going to path %s\n", kernel_cache_path.c_str()); int status; status = std::filesystem::create_directories(kernel_cache_path.c_str()); // if (status != 0 ) return std::string(); @@ -128,7 +128,7 @@ named_prog GBJitCache::getProgram( { // Lock for thread safety std::lock_guard lock(_program_cache_mutex); - printf(" jit_cache get program %s\n", prog_name.c_str()); +// printf(" jit_cache get program %s\n", prog_name.c_str()); return getCached(prog_name, program_map, [&](){ @@ -155,7 +155,7 @@ named_prog GBJitCache::getKernelInsta std::string kern_inst_name = kern_name; for ( auto&& arg : arguments ) kern_inst_name += '_' + arg; - printf(" got kernel instance %s\n",kern_inst_name.c_str()); +// printf(" got kernel instance %s\n",kern_inst_name.c_str()); return getCached(kern_inst_name, kernel_inst_map, [&](){return program.kernel(kern_name) @@ -197,7 +197,7 @@ std::string GBJitCache::cacheFile::read_file() int fd = open ( _file_name.c_str(), O_RDWR ); if ( fd == -1 ) { // TODO: connect errors to GrB_error result - printf(" failed to open cache file %s\n",_file_name.c_str()); +// printf(" failed to open cache file %s\n",_file_name.c_str()); successful_read = false; return std::string(); } @@ -231,10 +231,10 @@ std::string GBJitCache::cacheFile::read_file() return content; // FIXME: use unique_ptr here } - printf("about to close\n"); +// printf("about to close\n"); fclose(fp); successful_read = true; - printf(" read cache file %s\n",_file_name.c_str()); +// printf(" read cache file %s\n",_file_name.c_str()); return content; } @@ -244,7 +244,7 @@ void GBJitCache::cacheFile::write(std::string content) // Open file and create if it doesn't exist, with access 0600 int fd = open ( _file_name.c_str(), O_RDWR | O_CREAT, 
S_IRUSR | S_IWUSR ); if ( fd == -1 ) { - printf(" failed to open cache file for write %s\n",_file_name.c_str()); +// printf(" failed to open cache file for write %s\n",_file_name.c_str()); successful_write = false; return; } @@ -260,7 +260,7 @@ void GBJitCache::cacheFile::write(std::string content) // Copy string into file if( fwrite(content.c_str(), content.length(), 1, fp) != 1 ) { - printf(" failed to write cache file %s\n",_file_name.c_str()); +// printf(" failed to write cache file %s\n",_file_name.c_str()); successful_write = false; fclose(fp); return; diff --git a/GraphBLAS/CUDA/GB_jit_cache.h b/GraphBLAS/CUDA/GB_jit_cache.h index 0b6f22862..7fe947fa5 100644 --- a/GraphBLAS/CUDA/GB_jit_cache.h +++ b/GraphBLAS/CUDA/GB_jit_cache.h @@ -46,6 +46,7 @@ class File_Desc virtual void macrofy() { printf("Uh oh. this isn't good\n"); + } std::string filename; }; @@ -203,13 +204,13 @@ class GBJitCache umap_str_shptr& map ) { - printf("INside get cached file\n"); +// printf("INside get cached file\n"); std::string name = file_object.filename; // Find memory cached T object auto it = map.find(name); if ( it != map.end()) { - std::cout<<"found memory-cached file "<second); } else { // Find file cached T object @@ -219,27 +220,27 @@ class GBJitCache std::string file_name = cache_dir + "/" + name; if (not cache_dir.empty() ) { // TODO: Use OS-agnostic path separator here - std::cout<<"looking for prog in file "<second); } else { // Find file cached T object @@ -288,17 +289,17 @@ class GBJitCache #endif if (not successful_read) { // JIT compile and write to file if possible - std::cout << "compiling now" << std::endl; +// std::cout << "compiling now" << std::endl; auto f = func(); - std::cout << "completed func()" << std::endl; +// std::cout << "completed func()" << std::endl; serialized = f.serialize(); - std::cout<<" compiled serialized prog "<op->opcode ; + if (ngpus_to_use > 0 - && (reduce->header_size == 0) // semiring is built-in + // do it on the CPU if the monoid operator is user-defined: + // FIXME: handle user-defined operators + && (opcode != GB_USER_binop_code) + // the ANY monoid takes O(1) time; do it on the CPU: + && (opcode != GB_ANY_binop_code) + // FIXME: handle user-defined types: && (A->type->code != GB_UDT_code) - // FIXME: this is easy + // A iso takes O(log(nvals(A))) time; do it on the CPU: && !A->iso - ) { + ) + { return true; } else { return false; } - } + diff --git a/GraphBLAS/CUDA/GB_stringify.h b/GraphBLAS/CUDA/GB_stringify.h index 79da4f090..1cb87cc67 100644 --- a/GraphBLAS/CUDA/GB_stringify.h +++ b/GraphBLAS/CUDA/GB_stringify.h @@ -25,47 +25,89 @@ #define GB_CUDA_STRLEN 2048 //------------------------------------------------------------------------------ -// GB_stringify_semiring: build all strings for a semiring +// left and right shift //------------------------------------------------------------------------------ -void GB_stringify_semiring // build a semiring (name and code) +#define LSHIFT(x,k) (((uint64_t) x) << k) +#define RSHIFT(x,k,b) ((x >> k) & ((((uint64_t)0x00000001) << b) -1)) + +//------------------------------------------------------------------------------ +// GB_stringify_reduce +//------------------------------------------------------------------------------ + +void GB_enumify_reduce // enumerate a GrB_reduce problem ( + // output: + uint64_t *rcode, // unique encoding of the entire problem // input: - FILE *fp, // File to write macros, assumed open already - GrB_Semiring semiring, // the semiring to stringify - bool flipxy, // multiplier is: 
mult(a,b) or mult(b,a) - GrB_Type ctype, // the type of C - GrB_Type mtype, // the type of M, or NULL if no mask - GrB_Type atype, // the type of A - GrB_Type btype, // the type of B + GrB_Monoid reduce, // the monoid to enumify + GrB_Matrix A +) ; + +void GB_macrofy_reduce // construct all macros for GrB_reduce to scalar +( + // input: + FILE *fp, // target file to write, already open + uint64_t rcode +) ; + +//------------------------------------------------------------------------------ +// GB_stringify_ewise +//------------------------------------------------------------------------------ + +void GB_enumify_ewise // enumerate a GrB_eWise problem +( + // output: 2 x uint64? + uint64_t *ecode, // unique encoding of the entire operation + // input: + // C matrix: + bool C_iso, // if true, operator is ignored + int C_sparsity, // sparse, hyper, bitmap, or full + GrB_Type ctype, // C=((ctype) T) is the final typecast + // M matrix: + GrB_Matrix M, // may be NULL bool Mask_struct, // mask is structural bool Mask_comp, // mask is complemented - int C_sparsity, // sparsity structure of C - int M_sparsity, // sparsity structure of M - int A_sparsity, // sparsity structure of A - int B_sparsity // sparsity structure of B + // operator: + GrB_BinaryOp binaryop, // the binary operator to enumify + // A and B: + GrB_Matrix A, + GrB_Matrix B ) ; -void GB_enumify_semiring // enumerate a semiring +void GB_macrofy_ewise // construct all macros for GrB_eWise ( - // output: + // input: + FILE *fp, // target file to write, already open + uint64_t ecode +) ; + +//------------------------------------------------------------------------------ +// GB_stringify_mxm +//------------------------------------------------------------------------------ + +void GB_enumify_mxm // enumerate a GrB_mxm problem +( + // output: 2 x uint64? 
uint64_t *scode, // unique encoding of the entire semiring // input: - GrB_Semiring semiring, // the semiring to enumify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) - GrB_Type ctype, // the type of C - GrB_Type mtype, // the type of M, or NULL if no mask - GrB_Type atype, // the type of A - GrB_Type btype, // the type of B + // C matrix: + bool C_iso, // if true, semiring is ignored + int C_sparsity, // sparse, hyper, bitmap, or full + GrB_Type ctype, // C=((ctype) T) is the final typecast + // M matrix: + GrB_Matrix M, // may be NULL bool Mask_struct, // mask is structural bool Mask_comp, // mask is complemented - int C_sparsity, // sparsity structure of C - int M_sparsity, // sparsity structure of M - int A_sparsity, // sparsity structure of A - int B_sparsity // sparsity structure of B + // semiring: + GrB_Semiring semiring, // the semiring to enumify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + // A and B: + GrB_Matrix A, + GrB_Matrix B ) ; -void GB_macrofy_semiring // construct all macros for a semiring +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // input: FILE *fp, // target file to write, already open @@ -267,18 +309,6 @@ void GB_macrofy_terminal_statement // macro for terminal statement const char *terminal_statement ) ; -//------------------------------------------------------------------------------ -// GB_stringify_load: return a string to load/typecast macro -//------------------------------------------------------------------------------ - -void GB_stringify_load // return a string to load/typecast macro -( - // input: - FILE *fp, // File to write macros, assumed open already - const char *load_macro_name, // name of macro to construct - bool is_pattern // if true, load/cast does nothing -) ; - //------------------------------------------------------------------------------ // GB_stringify_opcode: name of unary/binary opcode //------------------------------------------------------------------------------ diff --git a/GraphBLAS/CUDA/GB_stringify_binop.c b/GraphBLAS/CUDA/GB_stringify_binop.c index bf0b06bcc..87a819815 100644 --- a/GraphBLAS/CUDA/GB_stringify_binop.c +++ b/GraphBLAS/CUDA/GB_stringify_binop.c @@ -37,8 +37,6 @@ void GB_stringify_binop // get ecode from opcode, xcode, and for_semiring GB_enumify_binop (&ecode, opcode, xcode, for_semiring) ; - printf("ecode in stringify binop: %d\n", ecode); - // convert ecode to string GB_charify_binop (&op_string, ecode) ; @@ -104,7 +102,6 @@ void GB_enumify_binop case GB_MAX_binop_code : // z = max(x,y) - printf("INside max binop code\n"); switch (xcode) { case GB_BOOL_code : e = 17 ; break ; // x || y @@ -118,8 +115,6 @@ void GB_enumify_binop case GB_PLUS_binop_code : // z = x + y - printf("Inside plus binop code\n"); - switch (xcode) { case GB_BOOL_code : e = 17 ; break ; // x || y @@ -520,7 +515,6 @@ void GB_enumify_binop default : break ; } - printf("e %d\n", e); (*ecode) = e ; } @@ -539,8 +533,6 @@ void GB_charify_binop const char *f ; - printf("ecode in charify binop: %d\n", ecode); - switch (ecode) { @@ -791,7 +783,6 @@ void GB_charify_binop default : f = NULL ; ; break ; } - printf("f = %s, %d\n", f, ecode); (*op_string) = f ; } diff --git a/GraphBLAS/CUDA/GB_stringify_ewise.c b/GraphBLAS/CUDA/GB_stringify_ewise.c new file mode 100644 index 000000000..f89c67d30 --- /dev/null +++ b/GraphBLAS/CUDA/GB_stringify_ewise.c @@ -0,0 +1,297 @@ +//------------------------------------------------------------------------------ +// GB_stringify_ewise: build strings for GrB_eWise* (Add, Mult, and 
Union) +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2021, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// Construct a string defining all macros for an ewise operation, for +// eWiseAdd, eWiseMult, and eWiseUnion, and its name. +// User-defined types are not handled. + +#include "GB.h" +#include "GB_stringify.h" + +//------------------------------------------------------------------------------ +// GB_enumify_ewise: enumerate a GrB_eWise* problem +//------------------------------------------------------------------------------ + +// dot3: C=A'*B, no accum +// saxpy +// inplace C_in is full/bitmap +// C_in += A*B monoid ztype doesn't cast (= accum->ytype) +// C_in = A*B monoid ztype casts to C_in->type +// ... + +// accum is not present. Kernels that use it would require accum to be +// the same as the monoid binary operator. + +void GB_enumify_ewise // enumerate a GrB_eWise problem +( + // output: 2 x uint64? + uint64_t *ecode, // unique encoding of the entire operation + // input: + // C matrix: + bool C_iso, // if true, operator is ignored + int C_sparsity, // sparse, hyper, bitmap, or full + GrB_Type ctype, // C=((ctype) T) is the final typecast + // M matrix: + GrB_Matrix M, // may be NULL + bool Mask_struct, // mask is structural + bool Mask_comp, // mask is complemented + // operator: + GrB_BinaryOp binaryop, // the binary operator to enumify + // A and B: + GrB_Matrix A, + GrB_Matrix B +) +{ + + //-------------------------------------------------------------------------- + // handle the C_iso case + //-------------------------------------------------------------------------- + + if (C_iso) + { + // values of C are not computed by the kernel + binaryop = GxB_PAIR_BOOL ; + } + + //-------------------------------------------------------------------------- + // get the types + //-------------------------------------------------------------------------- + + GrB_Type atype = A->type ; + GrB_Type btype = B->type ; + GrB_Type mtype = (M == NULL) ? NULL : M->type ; + + GrB_Type xtype = binaryop->xtype ; + GrB_Type ytype = binaryop->ytype ; + GrB_Type ztype = binaryop->ztype ; + + GB_Opcode binaryop_opcode = binaryop->opcode ; + + GB_Type_code xcode = xtype->code ; + GB_Type_code ycode = ytype->code ; + GB_Type_code zcode = ztype->code ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) + { + // rename the operator + binaryop_opcode = GB_boolean_rename (binaryop_opcode) ; + } + + //-------------------------------------------------------------------------- + // determine if A and/or B are value-agnostic + //-------------------------------------------------------------------------- + + // These 1st, 2nd, and pair operators are all handled by the flip, so if + // flipxy is still true, all of these booleans will be false. 
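    // For example (editorial note, not part of this commit): with the
    // operator GrB_SECOND, z = f(x,y) = y, so the values of A are never
    // read.  Then op_is_second is true below, A_is_pattern becomes true,
    // acode is enumified as 0, and GB_macrofy_ewise emits
    // #define GB_A_IS_PATTERN 1, selecting a kernel that uses only the
    // pattern of A.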
+ bool op_is_first = (binaryop_opcode == GB_FIRST_binop_code ) ; + bool op_is_second = (binaryop_opcode == GB_SECOND_binop_code) ; + bool op_is_pair = (binaryop_opcode == GB_PAIR_binop_code) ; + bool A_is_pattern = op_is_second || op_is_pair ; + bool B_is_pattern = op_is_first || op_is_pair ; + + //-------------------------------------------------------------------------- + // enumify the binary operator + //-------------------------------------------------------------------------- + + int binaryop_ecode ; + GB_enumify_binop (&binaryop_ecode, binaryop_opcode, xcode, true) ; + + //-------------------------------------------------------------------------- + // enumify the types + //-------------------------------------------------------------------------- + + int acode = A_is_pattern ? 0 : atype->code ; // 0 to 14 + int bcode = B_is_pattern ? 0 : btype->code ; // 0 to 14 + int ccode = C_iso ? 0 : ctype->code ; // 0 to 14 + + int A_iso_code = A->iso ? 1 : 0 ; + int B_iso_code = B->iso ? 1 : 0 ; + + //-------------------------------------------------------------------------- + // enumify the mask + //-------------------------------------------------------------------------- + + int mtype_code = (mtype == NULL) ? 0 : mtype->code ; // 0 to 14 + int mask_ecode ; + GB_enumify_mask (&mask_ecode, mtype_code, Mask_struct, Mask_comp) ; + + //-------------------------------------------------------------------------- + // enumify the sparsity structures of C, M, A, and B + //-------------------------------------------------------------------------- + + int M_sparsity = GB_sparsity (M) ; + int A_sparsity = GB_sparsity (A) ; + int B_sparsity = GB_sparsity (B) ; + + int csparsity, msparsity, asparsity, bsparsity ; + GB_enumify_sparsity (&csparsity, C_sparsity) ; + GB_enumify_sparsity (&msparsity, M_sparsity) ; + GB_enumify_sparsity (&asparsity, A_sparsity) ; + GB_enumify_sparsity (&bsparsity, B_sparsity) ; + + //-------------------------------------------------------------------------- + // construct the ewise ecode + //-------------------------------------------------------------------------- + + // total ecode bits: 46 + + printf ("before: " + " binaryop_ecode: %d, zcode: %d, xcode: %d, ycode: %d\n" + ", mask_ecode: %d, ccode: %d, acode: %d, bcode: %d, \n" + "csparsity: %d, msparsity: %d, asparsity: %d, bsparsity: %d\n", + binaryop_ecode, zcode, xcode, ycode, mask_ecode, ccode, acode, + bcode, csparsity, msparsity, asparsity, bsparsity) ; + + (*ecode) = + // range bits + + LSHIFT (A_iso_code , 45) | // 0 or 1 1 + LSHIFT (B_iso_code , 44) | // 0 or 1 1 + + // binaryop, z = f(x,y) + LSHIFT (binaryop_ecode, 36) | // 0 to 139 8 + LSHIFT (zcode , 32) | // 0 to 14 4 + LSHIFT (xcode , 28) | // 0 to 14 4 + LSHIFT (ycode , 24) | // 0 to 14 4 + + // mask + LSHIFT (mask_ecode , 20) | // 0 to 13 4 + + // types of C, A, and B (bool, int*, uint*, etc) + LSHIFT (ccode , 16) | // 0 to 14 4 + LSHIFT (acode , 12) | // 0 to 14 4 + LSHIFT (bcode , 8) | // 0 to 14 4 + + // sparsity structures of C, M, A, and B + LSHIFT (csparsity , 6) | // 0 to 3 2 + LSHIFT (msparsity , 4) | // 0 to 3 2 + LSHIFT (asparsity , 2) | // 0 to 3 2 + LSHIFT (bsparsity , 0) ; // 0 to 3 2 + + printf ("binaryop_ecode: %lu\n", *ecode) ; +} + +//------------------------------------------------------------------------------ +// GB_macrofy_ewise: construct all macros for a semiring +//------------------------------------------------------------------------------ + +void GB_macrofy_ewise // construct all macros for GrB_eWise +( + // input: + FILE *fp, // 
target file to write, already open + uint64_t ecode +) +{ + + printf ("ecode in macrofy_ewise: %lu\n", ecode) ; + + //-------------------------------------------------------------------------- + // extract the binaryop ecode + //-------------------------------------------------------------------------- + + // A and B iso-valued + int A_iso_code = RSHIFT (ecode, 45, 1) ; + int B_iso_code = RSHIFT (ecode, 44, 1) ; + + // binary operator + int binaryop_ecode = RSHIFT (ecode, 36, 8) ; + int zcode = RSHIFT (ecode, 32, 4) ; + int xcode = RSHIFT (ecode, 28, 4) ; + int ycode = RSHIFT (ecode, 24, 4) ; + + // mask + int mask_ecode = RSHIFT (ecode, 20, 4) ; + + // types of C, A, and B + int ccode = RSHIFT (ecode, 16, 4) ; // if 0: C is iso + int acode = RSHIFT (ecode, 12, 4) ; // if 0: A is pattern + int bcode = RSHIFT (ecode, 8, 4) ; // if 0: B is pattern + + // formats of C, A, and B + int csparsity = RSHIFT (ecode, 6, 2) ; + int msparsity = RSHIFT (ecode, 4, 2) ; + int asparsity = RSHIFT (ecode, 2, 2) ; + int bsparsity = RSHIFT (ecode, 0, 2) ; + + printf ("before: " + " binaryop_ecode: %d, zcode: %d, xcode: %d, ycode: %d\n" + ", mask_ecode: %d, ccode: %d, acode: %d, bcode: %d, \n" + "csparsity: %d, msparsity: %d, asparsity: %d, bsparsity: %d\n", + binaryop_ecode, zcode, xcode, ycode, mask_ecode, ccode, acode, + bcode, csparsity, msparsity, asparsity, bsparsity) ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A and B (and typecast) them + //-------------------------------------------------------------------------- + + int A_is_pattern = (acode == 0) ? 1 : 0 ; + int B_is_pattern = (bcode == 0) ? 1 : 0 ; + + fprintf (fp, "// GB_ewise_%016" PRIX64 ".h\n", ecode) ; + fprintf (fp, "#define GB_A_IS_PATTERN %d\n", A_is_pattern) ; + fprintf (fp, "#define GB_A_ISO %d\n", A_iso_code) ; + fprintf (fp, "#define GB_B_IS_PATTERN %d\n", B_is_pattern) ; + fprintf (fp, "#define GB_B_ISO %d\n", B_iso_code) ; + + //-------------------------------------------------------------------------- + // construct macros for the binary operator + //-------------------------------------------------------------------------- + + const char *s ; + GB_charify_binop (&s, binaryop_ecode) ; + GB_macrofy_binop (fp, "GB_BINOP", s, false) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + bool C_iso = (ccode == 0) ; + if (C_iso) + { + fprintf (fp, "#define GB_PUTC(blob)\n") ; + fprintf (fp, "#define GB_C_ISO 1\n") ; + } + else + { + fprintf (fp, "#define GB_PUTC(blob) blob\n") ; + fprintf (fp, "#define GB_C_ISO 0\n") ; + } + + //-------------------------------------------------------------------------- + // construct the macros to access the mask (if any), and its name + //-------------------------------------------------------------------------- + + GB_macrofy_mask (fp, mask_ecode) ; + + //-------------------------------------------------------------------------- + // determine the sparsity formats of C, M, A, and B + //-------------------------------------------------------------------------- + + GB_macrofy_sparsity (fp, "C", csparsity) ; + GB_macrofy_sparsity (fp, "M", msparsity) ; + GB_macrofy_sparsity (fp, "A", asparsity) ; + GB_macrofy_sparsity (fp, "B", bsparsity) ; +} + diff --git a/GraphBLAS/CUDA/GB_stringify_load.c b/GraphBLAS/CUDA/GB_stringify_load.c deleted file mode 100644 index 
f0c6d08b3..000000000 --- a/GraphBLAS/CUDA/GB_stringify_load.c +++ /dev/null @@ -1,50 +0,0 @@ -//------------------------------------------------------------------------------ -// GB_stringify_load: return a string to load/save a value -//------------------------------------------------------------------------------ - -// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2021, All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -//------------------------------------------------------------------------------ - -// TODO: the typecast should be handled better. See ../Source/GB_casting.h, -// and note the inline functions to cast from double to integer. - -// Construct a macro to load and typecast. For example: -// -// #define GB_GETA(blob) blob -// -// then use as: -// GB_GETA (double aij = Ax [p]) ; -// GB_GETA (double *Ax = A->x) ; -// GB_GETA (T_A *restrict Ax = A->x) ; -// -// which become -// double aij = Ax [p] ; -// double *Ax = A->x ; -// T_A *Ax = A->x ; -// -// or, if is_pattern is true, the macro becomes the empty string. - -#include "GB.h" -#include "GB_stringify.h" - -void GB_stringify_load // return a string to load/typecast macro -( - // input: - FILE *fp, // File to write macros, assumed open already - const char *load_macro_name, // name of macro to construct - bool is_pattern // if true, load/cast does nothing -) -{ - - if (is_pattern) - { - fprintf ( fp, "#define %s(blob)\n", load_macro_name) ; - } - else - { - fprintf ( fp, "#define %s(blob) blob\n", load_macro_name) ; - } -} - diff --git a/GraphBLAS/CUDA/GB_stringify_mask.c b/GraphBLAS/CUDA/GB_stringify_mask.c index 30aa16de2..d71f20f5a 100644 --- a/GraphBLAS/CUDA/GB_stringify_mask.c +++ b/GraphBLAS/CUDA/GB_stringify_mask.c @@ -29,7 +29,6 @@ void GB_stringify_mask // return string to define mask macros { int mask_ecode ; - printf ("GB_stringify_mask gets mcode: %d\n", mcode) ; // get mask_ecode from mask type (mask_ecode) and mask descriptor GB_enumify_mask (&mask_ecode, mcode, Mask_struct, Mask_comp) ; @@ -42,6 +41,8 @@ void GB_stringify_mask // return string to define mask macros // GB_enumify_mask: return mask_ecode to define mask macros //------------------------------------------------------------------------------ +// FIXME: add M_sparsity here too? + void GB_enumify_mask // return enum to define mask macros ( // output: @@ -58,8 +59,6 @@ void GB_enumify_mask // return enum to define mask macros // Mask_comp = (mask_ecode & 0x1) can be computed later. 
// Mask_struct = (mask_ecode == 2 || mask_ecode == 3) - printf ("GB_enumify_mask gets mcode: %d Mask_struct: %d Mask_comp: %d\n", - mcode, Mask_struct, Mask_comp) ; int e = -1 ; if (mcode == 0) @@ -157,7 +156,6 @@ void GB_macrofy_mask // return enum to define mask macros { const char *f ; - printf ("GB_macrofy_mask gets mask_ecode: %d\n", mask_ecode) ; switch (mask_ecode) { @@ -309,8 +307,6 @@ void GB_macrofy_mask // return enum to define mask macros break ; } - printf ("HERE is the Mask stuff:\n%s\n", f) ; - fprintf( fp, "%s\n", f ) ; } diff --git a/GraphBLAS/CUDA/GB_stringify_monoid.c b/GraphBLAS/CUDA/GB_stringify_monoid.c index 0ac3d0075..8ee208afa 100644 --- a/GraphBLAS/CUDA/GB_stringify_monoid.c +++ b/GraphBLAS/CUDA/GB_stringify_monoid.c @@ -29,15 +29,11 @@ void GB_enumify_monoid // enumerate a monoid ) { - printf("Calling enumify binop\n"); GB_enumify_binop (add_ecode, add_opcode, zcode, false) ; ASSERT (*add_ecode < 32) ; - printf("Calling enumify identity\n"); GB_enumify_identity (id_ecode, add_opcode, zcode) ; bool is_term ; - printf("Calling enumify terminal\n"); GB_enumify_terminal (&is_term, term_ecode, add_opcode, zcode) ; - printf("Done enumify monoid\n"); } //------------------------------------------------------------------------------ diff --git a/GraphBLAS/CUDA/GB_stringify_semiring.c b/GraphBLAS/CUDA/GB_stringify_mxm.c similarity index 63% rename from GraphBLAS/CUDA/GB_stringify_semiring.c rename to GraphBLAS/CUDA/GB_stringify_mxm.c index 950d8b668..b4533240d 100644 --- a/GraphBLAS/CUDA/GB_stringify_semiring.c +++ b/GraphBLAS/CUDA/GB_stringify_mxm.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_stringify_semiring: build strings for a semiring +// GB_stringify_mxm: build strings for GrB_mxm //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2021, All Rights Reserved. @@ -14,99 +14,79 @@ #include "GB_stringify.h" //------------------------------------------------------------------------------ -// GB_stringify_semiring: build strings for a semiring +// GB_enumify_mxm: enumerate a GrB_mxm problem //------------------------------------------------------------------------------ -void GB_stringify_semiring // build a semiring (name and code) -( - // input: - FILE *fp, // File to write macros, assumed open already - GrB_Semiring semiring, // the semiring to stringify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) - GrB_Type ctype, // the type of C - GrB_Type mtype, // the type of M, or NULL if no mask - GrB_Type atype, // the type of A - GrB_Type btype, // the type of B - bool Mask_struct, // mask is structural - bool Mask_comp, // mask is complemented - int C_sparsity, // sparsity structure of C - int M_sparsity, // sparsity structure of M - int A_sparsity, // sparsity structure of A - int B_sparsity // sparsity structure of B -) -{ - - uint64_t scode ; +// dot3: C=A'*B, no accum +// saxpy +// inplace C_in is full/bitmap +// C_in += A*B monoid ztype doesn't cast (= accum->ytype) +// C_in = A*B monoid ztype casts to C_in->type +// ... - printf("Inside stringify semiring\n"); +// accum is not present. Kernels that use it would require accum to be +// the same as the monoid binary operator. 
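// A self-contained sketch (editorial illustration, not code from this
// commit) of how the scode round-trips: GB_enumify_mxm packs each field
// with LSHIFT (value, bit position) and GB_macrofy_mxm recovers it with
// RSHIFT (code, bit position, field width), using the macros defined in
// GB_stringify.h (repeated here so the sketch compiles on its own).  The
// field values below are hypothetical.

#include <stdint.h>
#include <assert.h>

#define LSHIFT(x,k)   (((uint64_t) x) << k)
#define RSHIFT(x,k,b) ((x >> k) & ((((uint64_t) 0x00000001) << b) - 1))

static void scode_roundtrip_demo (void)
{
    // pack three hypothetical 4-bit type codes (ccode, acode, bcode):
    uint64_t scode = LSHIFT (11, 16) | LSHIFT (10, 12) | LSHIFT (10, 8) ;
    // unpack them with the matching bit positions and widths:
    assert (RSHIFT (scode, 16, 4) == 11) ;   // ccode
    assert (RSHIFT (scode, 12, 4) == 10) ;   // acode
    assert (RSHIFT (scode,  8, 4) == 10) ;   // bcode
}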
- GB_enumify_semiring (&scode, - semiring, flipxy, - ctype, mtype, atype, btype, Mask_struct, Mask_comp, - C_sparsity, M_sparsity, A_sparsity, B_sparsity) ; - printf("done enumify semiring: scode is %lu\n", scode); - - GB_macrofy_semiring ( fp, scode) ; - - printf("done macrofy semiring\n"); -} - -//------------------------------------------------------------------------------ -// GB_enumify_semiring: enumerate a semiring -//------------------------------------------------------------------------------ - -void GB_enumify_semiring // enumerate a semiring +void GB_enumify_mxm // enumerate a GrB_mxm problem ( - // output: + // output: 2 x uint64? uint64_t *scode, // unique encoding of the entire semiring // input: - GrB_Semiring semiring, // the semiring to enumify - bool flipxy, // multiplier is: mult(a,b) or mult(b,a) - GrB_Type ctype, // the type of C - GrB_Type mtype, // the type of M, or NULL if no mask - GrB_Type atype, // the type of A - GrB_Type btype, // the type of B + // C matrix: + bool C_iso, // if true, semiring is ignored + int C_sparsity, // sparse, hyper, bitmap, or full + GrB_Type ctype, // C=((ctype) T) is the final typecast + // M matrix: + GrB_Matrix M, // may be NULL bool Mask_struct, // mask is structural bool Mask_comp, // mask is complemented - int C_sparsity, // sparsity structure of C - int M_sparsity, // sparsity structure of M - int A_sparsity, // sparsity structure of A - int B_sparsity // sparsity structure of B + // semiring: + GrB_Semiring semiring, // the semiring to enumify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + // A and B: + GrB_Matrix A, + GrB_Matrix B ) { + //-------------------------------------------------------------------------- + // handle the C_iso case + //-------------------------------------------------------------------------- + + if (C_iso) + { + semiring = GxB_ANY_PAIR_BOOL ; + flipxy = false ; + } + //-------------------------------------------------------------------------- // get the semiring //-------------------------------------------------------------------------- - printf("inside enumify: \n") ; - GxB_print (semiring, 3) ; - printf("Getting semiring add\n"); + // GxB_print (semiring, 3) ; GrB_Monoid add = semiring->add ; - - printf("Getting semiring mult\n"); GrB_BinaryOp mult = semiring->multiply ; - - printf("Getting semiring add op\n"); GrB_BinaryOp addop = add->op ; - printf("Getting types\n"); + //-------------------------------------------------------------------------- + // get the types + //-------------------------------------------------------------------------- + + GrB_Type atype = A->type ; + GrB_Type btype = B->type ; + GrB_Type mtype = (M == NULL) ? 
NULL : M->type ; + GrB_Type xtype = mult->xtype ; GrB_Type ytype = mult->ytype ; GrB_Type ztype = mult->ztype ; - printf("Getting opcodes\n"); GB_Opcode mult_opcode = mult->opcode ; GB_Opcode add_opcode = addop->opcode ; - - - printf("Getting typecodes\n"); GB_Type_code xcode = xtype->code ; GB_Type_code ycode = ytype->code ; GB_Type_code zcode = ztype->code ; - printf("Performing asserts\n"); // these must always be true for any semiring: ASSERT (mult->ztype == addop->ztype) ; ASSERT (addop->xtype == addop->ztype && addop->ytype == addop->ztype) ; @@ -127,15 +107,12 @@ void GB_enumify_semiring // enumerate a semiring // ISGE becomes GE // ISLE becomes LE - printf("Invoking boolean rename\n"); if (zcode == GB_BOOL_code) { // rename the monoid add_opcode = GB_boolean_rename (add_opcode) ; } - printf("Invoking boolean rename\n"); - if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) { // rename the multiplicative operator @@ -180,7 +157,6 @@ void GB_enumify_semiring // enumerate a semiring //-------------------------------------------------------------------------- // enumify the multiplier //-------------------------------------------------------------------------- - printf("Invoking enumify binop\n"); int mult_ecode ; GB_enumify_binop (&mult_ecode, mult_opcode, xcode, true) ; @@ -188,41 +164,38 @@ void GB_enumify_semiring // enumerate a semiring //-------------------------------------------------------------------------- // enumify the monoid //-------------------------------------------------------------------------- - printf("Invoking enumify monoid\n"); int add_ecode, id_ecode, term_ecode ; - GB_enumify_monoid (&add_ecode, &id_ecode, &term_ecode, add_opcode, zcode ) ; + GB_enumify_monoid (&add_ecode, &id_ecode, &term_ecode, add_opcode, zcode) ; //-------------------------------------------------------------------------- // enumify the types //-------------------------------------------------------------------------- - printf("Done invoking enumify monoid\n"); - - - printf("atype\n"); int acode = A_is_pattern ? 0 : atype->code ; // 0 to 14 - printf("btype\n"); int bcode = B_is_pattern ? 0 : btype->code ; // 0 to 14 - printf("ctype\n"); - int ccode = ctype->code ; // 1 to 14 + int ccode = C_iso ? 0 : ctype->code ; // 0 to 14 + + int A_iso_code = A->iso ? 1 : 0 ; + int B_iso_code = B->iso ? 1 : 0 ; //-------------------------------------------------------------------------- // enumify the mask //-------------------------------------------------------------------------- - printf("Invoking enumify_mask, mtype: \n"); - GxB_print (mtype, 3) ; + // GxB_print (mtype, 3) ; int mtype_code = (mtype == NULL) ? 
0 : mtype->code ; // 0 to 14 int mask_ecode ; - printf("Mask_struct: %d, Mask_comp: %d\n", Mask_struct, Mask_comp); GB_enumify_mask (&mask_ecode, mtype_code, Mask_struct, Mask_comp) ; - printf ("got mask_ecode: %d\n", mask_ecode) ; //-------------------------------------------------------------------------- // enumify the sparsity structures of C, M, A, and B //-------------------------------------------------------------------------- + int M_sparsity = GB_sparsity (M) ; + int A_sparsity = GB_sparsity (A) ; + int B_sparsity = GB_sparsity (B) ; + int csparsity, msparsity, asparsity, bsparsity ; GB_enumify_sparsity (&csparsity, C_sparsity) ; GB_enumify_sparsity (&msparsity, M_sparsity) ; @@ -233,20 +206,14 @@ void GB_enumify_semiring // enumerate a semiring // construct the semiring scode //-------------------------------------------------------------------------- - // total scode bits: 60 - - printf("constructing semiring scode\n"); - -#define LSHIFT(x,k) (((uint64_t) x) << k) - printf("before: add_ecode: %d, id_ecode: %d, term_ecode: %d, mult_ecode: %d, flipxy: %d, zcode: %d, " - "xcode: %d, ycode: %d, mask_ecode: %d, ccode: %d, acode: %d, bcode: %d, csparsity: %d, msparsity: %d, " - "asparsity: %d, bsparsity: %d\n", add_ecode, id_ecode, term_ecode, mult_ecode, flipxy, zcode, xcode, ycode, mask_ecode, - ccode, acode, bcode, csparsity, msparsity, asparsity, bsparsity); - - + // total scode bits: 62 (*scode) = // range bits + + LSHIFT (A_iso_code , 61) | // 0 or 1 1 + LSHIFT (B_iso_code , 60) | // 0 or 1 1 + // monoid LSHIFT (add_ecode , 55) | // 0 to 22 5 LSHIFT (id_ecode , 50) | // 0 to 31 5 @@ -263,28 +230,24 @@ void GB_enumify_semiring // enumerate a semiring LSHIFT (mask_ecode , 20) | // 0 to 13 4 // types of C, A, and B (bool, int*, uint*, etc) - LSHIFT (ccode , 16) | // 1 to 14 4 + LSHIFT (ccode , 16) | // 0 to 14 4 LSHIFT (acode , 12) | // 0 to 14 4 LSHIFT (bcode , 8) | // 0 to 14 4 - // sparsity structures of C, A, and B + // sparsity structures of C, M, A, and B LSHIFT (csparsity , 6) | // 0 to 3 2 LSHIFT (msparsity , 4) | // 0 to 3 2 LSHIFT (asparsity , 2) | // 0 to 3 2 LSHIFT (bsparsity , 0) ; // 0 to 3 2 - printf("serialized_scode: %lu\n", *scode); - - - printf("done enumify semiring\n"); - +// printf ("serialized_scode: %lu\n", *scode) ; } //------------------------------------------------------------------------------ -// GB_macrofy_semiring: construct all macros for a semiring +// GB_macrofy_mxm: construct all macros for a semiring //------------------------------------------------------------------------------ -void GB_macrofy_semiring // construct all macros for a semiring +void GB_macrofy_mxm // construct all macros for GrB_mxm ( // input: FILE *fp, // target file to write, already open @@ -292,14 +255,13 @@ void GB_macrofy_semiring // construct all macros for a semiring ) { - printf("scode in macrofy_semiring: %lu\n", scode); - //-------------------------------------------------------------------------- // extract the semiring scode //-------------------------------------------------------------------------- -#define RSHIFT(x,k,b) (x >> k) & ((((uint64_t)0x00000001) << b) -1) -//#define RSHIFT(x,k,b) (x >> k) & ((((uint64_t) 1) << (64-k) + b) - 1) + // A and B iso-valued + int A_iso_code = RSHIFT (scode, 61, 1) ; + int B_iso_code = RSHIFT (scode, 60, 1) ; // monoid int add_ecode = RSHIFT (scode, 55, 5) ; @@ -317,31 +279,17 @@ void GB_macrofy_semiring // construct all macros for a semiring // mask int mask_ecode = RSHIFT (scode, 20, 4) ; - printf("deserialized mask ecode: 
%d\n", mask_ecode); - // types of C, A, and B - int ccode = RSHIFT (scode, 16, 4) ; - int acode = RSHIFT (scode, 12, 4) ; - int bcode = RSHIFT (scode, 8, 4) ; - - printf("deserialized acode: %d\n", acode); + int ccode = RSHIFT (scode, 16, 4) ; // if 0: C is iso + int acode = RSHIFT (scode, 12, 4) ; // if 0: A is pattern + int bcode = RSHIFT (scode, 8, 4) ; // if 0: B is pattern - // TODO: I have a suspicion here that these might not - // be getting serialized properly because they are the - // only elements which require only 2 bits. - // formats of C, A, and B + // formats of C, M, A, and B int csparsity = RSHIFT (scode, 6, 2) ; int msparsity = RSHIFT (scode, 4, 2) ; int asparsity = RSHIFT (scode, 2, 2) ; int bsparsity = RSHIFT (scode, 0, 2) ; - printf("after: add_ecode: %d, id_ecode: %d, term_ecode: %d, mult_ecode: %d, flipxy: %d, zcode: %d, " - "xcode: %d, ycode: %d, mask_ecode: %d, ccode: %d, acode: %d, bcode: %d, csparsity: %d, msparsity: %d, " - "asparsity: %d, bsparsity: %d\n", add_ecode, id_ecode, term_ecode, mult_ecode, flipxy, zcode, xcode, ycode, mask_ecode, - ccode, acode, bcode, csparsity, msparsity, asparsity, bsparsity); - - printf("a sparsity ecode after rshift: %d\n", asparsity); - //-------------------------------------------------------------------------- // construct macros to load scalars from A and B (and typecast) them @@ -351,53 +299,79 @@ void GB_macrofy_semiring // construct all macros for a semiring // if flipxy false: A is typecasted to x, and B is typecasted to y. // if flipxy true: A is typecasted to y, and B is typecasted to x. - bool A_is_pattern = (acode == 0) ; - bool B_is_pattern = (bcode == 0) ; + int A_is_pattern = (acode == 0) ? 1 : 0 ; + int B_is_pattern = (bcode == 0) ? 1 : 0 ; - printf("stringify loaders \n"); - GB_stringify_load ( fp, "GB_GETA", A_is_pattern) ; - GB_stringify_load ( fp, "GB_GETB", B_is_pattern) ; + fprintf (fp, "// GB_mxm_%016" PRIX64 ".h\n", scode) ; + fprintf (fp, "#define GB_A_IS_PATTERN %d\n", A_is_pattern) ; + fprintf (fp, "#define GB_A_ISO %d\n", A_iso_code) ; + fprintf (fp, "#define GB_B_IS_PATTERN %d\n", B_is_pattern) ; + fprintf (fp, "#define GB_B_ISO %d\n", B_iso_code) ; //-------------------------------------------------------------------------- // construct macros for the multiply //-------------------------------------------------------------------------- - printf("stringify mult \n"); const char *s ; - printf("mult_ecode: %d\n", mult_ecode); - GB_charify_binop ( &s, mult_ecode) ; - GB_macrofy_binop ( fp, "GB_MULT", s, flipxy) ; + GB_charify_binop (&s, mult_ecode) ; + GB_macrofy_binop (fp, "GB_MULT", s, flipxy) ; + fprintf (fp, "#define GB_FLIPXY %d\n", flipxy ? 
1 : 0) ; //-------------------------------------------------------------------------- // construct the monoid macros //-------------------------------------------------------------------------- - printf("stringify monoid \n"); - GB_macrofy_monoid ( fp, add_ecode, id_ecode, term_ecode, is_term) ; + GB_macrofy_monoid (fp, add_ecode, id_ecode, term_ecode, is_term) ; + + //-------------------------------------------------------------------------- + // special cases + //-------------------------------------------------------------------------- + + // semiring is plus_pair_real + bool is_plus_pair_real = + (add_ecode == 11 // plus monoid + && mult_ecode == 133 // pair multiplicative operator + && !(zcode == GB_FC32_code || zcode == GB_FC64_code)) ; // real + + fprintf (fp, "#define GB_IS_PLUS_PAIR_REAL_SEMIRING %d\n", + is_plus_pair_real) ; + + // can ignore overflow in ztype when accumulating the result via the monoid + bool ztype_ignore_overflow = ( + zcode == GB_INT64_code || zcode == GB_UINT64_code || + zcode == GB_FP32_code || zcode == GB_FP64_code || + zcode == GB_FC32_code || zcode == GB_FC64_code) ; + + // note "CTYPE" is in the name in the CPU kernels (fix them to use ZTYPE) + fprintf (fp, "#define GB_ZTYPE_IGNORE_OVERFLOW %d\n", + ztype_ignore_overflow) ; //-------------------------------------------------------------------------- // macro to typecast the result back into C //-------------------------------------------------------------------------- - // for the ANY_PAIR semiring, "c_is_one" will be true, and Cx [0..cnz] will - // be filled with all 1's later. - bool c_is_one = false ; - // TODO: - // (add_ecode == GB_ANY_binop_code && mult_opcode == GB_PAIR_binop_code) ; - GB_stringify_load ( fp, "GB_PUTC", c_is_one) ; + bool C_iso = (ccode == 0) ; + if (C_iso) + { + fprintf (fp, "#define GB_PUTC(blob)\n") ; + fprintf (fp, "#define GB_C_ISO 1\n") ; + } + else + { + fprintf (fp, "#define GB_PUTC(blob) blob\n") ; + fprintf (fp, "#define GB_C_ISO 0\n") ; + } //-------------------------------------------------------------------------- // construct the macros to access the mask (if any), and its name //-------------------------------------------------------------------------- - printf("MACROFY MASK!\n"); - GB_macrofy_mask ( fp, mask_ecode); + GB_macrofy_mask (fp, mask_ecode) ; //-------------------------------------------------------------------------- // determine the sparsity formats of C, M, A, and B //-------------------------------------------------------------------------- - printf("stringify sparsity \n"); GB_macrofy_sparsity (fp, "C", csparsity) ; GB_macrofy_sparsity (fp, "M", msparsity) ; GB_macrofy_sparsity (fp, "A", asparsity) ; diff --git a/GraphBLAS/CUDA/GB_stringify_opcode.c b/GraphBLAS/CUDA/GB_stringify_opcode.c index 1326f77d3..b7b6fbd55 100644 --- a/GraphBLAS/CUDA/GB_stringify_opcode.c +++ b/GraphBLAS/CUDA/GB_stringify_opcode.c @@ -85,6 +85,7 @@ const char *GB_stringify_opcode // name of unary/binary opcode case GB_TGAMMA_unop_code : return ("tgamma") ; // z = tgamma (x) case GB_ERF_unop_code : return ("erf") ; // z = erf (x) case GB_ERFC_unop_code : return ("erfc") ; // z = erfc (x) + case GB_CBRT_unop_code : return ("cbrt") ; // z = cbrt (x) case GB_FREXPX_unop_code : return ("frexpx") ; // z = frexpx (x) case GB_FREXPE_unop_code : return ("frexpe") ; // z = frexpe (x) @@ -176,18 +177,18 @@ const char *GB_stringify_opcode // name of unary/binary opcode // binary operators for real floating-point types (TxT -> T) 
//---------------------------------------------------------------------- - case GB_ATAN2_binop_code : return ("atan2") ; // z = atan2 (x,y) - case GB_HYPOT_binop_code : return ("hypot") ; // z = hypot (x,y) - case GB_FMOD_binop_code : return ("fmod") ; // z = fmod (x,y) - case GB_REMAINDER_binop_code : return ("remainder") ; // z=remainder(x,y) - case GB_COPYSIGN_binop_code : return ("copysign") ; // z=copysign (x,y) - case GB_LDEXP_binop_code : return ("ldexp") ; // z = ldexp (x,y) + case GB_ATAN2_binop_code : return ("atan2") ; // z = atan2 (x,y) + case GB_HYPOT_binop_code : return ("hypot") ; // z = hypot (x,y) + case GB_FMOD_binop_code : return ("fmod") ; // z = fmod (x,y) + case GB_REMAINDER_binop_code : return ("remainder") ;// z=remainder(x,y) + case GB_COPYSIGN_binop_code : return ("copysign") ; // z=copysign (x,y) + case GB_LDEXP_binop_code : return ("ldexp") ; // z = ldexp (x,y) //---------------------------------------------------------------------- // binary operator z=f(x,y) where z is complex, x,y real: //---------------------------------------------------------------------- - case GB_CMPLX_binop_code : return ("cmplx") ; // z = cmplx (x,y) + case GB_CMPLX_binop_code : return ("cmplx") ; // z = cmplx (x,y) //---------------------------------------------------------------------- // positional binary operators: z is int64, x and y are ignored diff --git a/GraphBLAS/CUDA/GB_stringify_reduce.c b/GraphBLAS/CUDA/GB_stringify_reduce.c new file mode 100644 index 000000000..bceaf8ff8 --- /dev/null +++ b/GraphBLAS/CUDA/GB_stringify_reduce.c @@ -0,0 +1,168 @@ +//------------------------------------------------------------------------------ +// GB_stringify_reduce: build strings for GrB_reduce to scalar +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2021, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// Construct a string defining all macros for reduction to scalar, and its name. +// User-defined types are not handled. 
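+
+// Usage sketch (a hypothetical caller; not part of this file): the two
+// functions below work as a pair.  GB_enumify_reduce packs the monoid, the
+// type of A, and the sparsity of A into a 25-bit rcode, and
+// GB_macrofy_reduce expands that rcode back into the macros that the CUDA
+// jitifier will #include:
+//
+//      uint64_t rcode ;
+//      GB_enumify_reduce (&rcode, monoid, A) ;      // encode the problem
+//      FILE *fp = fopen (header_name, "w") ;        // "GB_reduce_<rcode>.h"
+//      GB_macrofy_reduce (fp, rcode) ;              // decode into macros
+//      fclose (fp) ;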
+ +#include "GB.h" +#include "GB_stringify.h" + +//------------------------------------------------------------------------------ +// GB_enumify_reduce: enumerate a GrB_reduce problem +//------------------------------------------------------------------------------ + +void GB_enumify_reduce // enumerate a GrB_reduce problem +( + // output: + uint64_t *rcode, // unique encoding of the entire problem + // input: + GrB_Monoid reduce, // the monoid to enumify + GrB_Matrix A +) +{ + + //-------------------------------------------------------------------------- + // get the monoid and type of A + //-------------------------------------------------------------------------- + + GrB_BinaryOp reduceop = reduce->op ; + GrB_Type atype = A->type ; + GrB_Type ztype = reduceop->ztype ; + GB_Opcode reduce_opcode = reduceop->opcode ; + // these must always be true for any monoid: + ASSERT (reduceop->xtype == reduceop->ztype) ; + ASSERT (reduceop->ytype == reduceop->ztype) ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + GB_Type_code zcode = ztype->code ; + if (zcode == GB_BOOL_code) + { + // rename the monoid + reduce_opcode = GB_boolean_rename (reduce_opcode) ; + } + + //-------------------------------------------------------------------------- + // enumify the monoid + //-------------------------------------------------------------------------- + + int red_ecode, id_ecode, term_ecode ; + GB_enumify_monoid (&red_ecode, &id_ecode, &term_ecode, reduce_opcode, + zcode) ; + + //-------------------------------------------------------------------------- + // enumify the type and sparsity structure of A + //-------------------------------------------------------------------------- + + int acode = atype->code ; // 0 to 14 + int A_sparsity = GB_sparsity (A) ; + int asparsity ; + GB_enumify_sparsity (&asparsity, A_sparsity) ; + + //-------------------------------------------------------------------------- + // construct the reduction rcode + //-------------------------------------------------------------------------- + + // total rcode bits: 25 + + (*rcode) = + // range bits + // monoid + LSHIFT (red_ecode , 20) | // 0 to 22 5 + LSHIFT (id_ecode , 15) | // 0 to 31 5 + LSHIFT (term_ecode , 10) | // 0 to 31 5 + + // type of the monoid + LSHIFT (zcode , 6) | // 0 to 14 4 + + // type of A + LSHIFT (acode , 2) | // 0 to 14 4 + + // sparsity structure of A + LSHIFT (asparsity , 0); // 0 to 3 2 +} + +//------------------------------------------------------------------------------ +// GB_macrofy_reduce: construct all macros for a reduction to scalar +//------------------------------------------------------------------------------ + +void GB_macrofy_reduce // construct all macros for GrB_reduce to scalar +( + // input: + FILE *fp, // target file to write, already open + uint64_t rcode +) +{ + + //-------------------------------------------------------------------------- + // extract the reduction rcode + //-------------------------------------------------------------------------- + + // monoid + int red_ecode = RSHIFT (rcode, 20, 5) ; + int id_ecode = RSHIFT 
(rcode, 15, 5) ; + int term_ecode = RSHIFT (rcode, 10, 5) ; + bool is_term = (term_ecode < 30) ; + + // type of the monoid + int zcode = RSHIFT (rcode, 6, 4) ; + + // type of A + int acode = RSHIFT (rcode, 2, 4) ; + + // format of A + int asparsity = RSHIFT (rcode, 0, 2) ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A (and typecast) them + //-------------------------------------------------------------------------- + + fprintf (fp, "// GB_reduce_%016" PRIX64 ".h\n", rcode) ; + fprintf (fp, "#define GB_A_IS_PATTERN 0\n") ; + fprintf (fp, "#define GB_A_ISO 0\n") ; + fprintf (fp, "#define GB_B_IS_PATTERN 1\n") ; + fprintf (fp, "#define GB_B_ISO 1\n") ; + fprintf (fp, "#define GB_FLIPXY 0\n") ; + fprintf (fp, "#define T_Y T_Z\n") ; + fprintf (fp, "#define T_X T_Z\n") ; + + //-------------------------------------------------------------------------- + // construct the monoid macros + //-------------------------------------------------------------------------- + + GB_macrofy_monoid (fp, red_ecode, id_ecode, term_ecode, is_term) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + fprintf (fp, "#define GB_PUTC(blob) blob\n") ; + fprintf (fp, "#define GB_C_ISO 0\n") ; + + //-------------------------------------------------------------------------- + // determine the sparsity format of A + //-------------------------------------------------------------------------- + + GB_macrofy_sparsity (fp, "A", asparsity) ; +} + diff --git a/GraphBLAS/CUDA/Makefile b/GraphBLAS/CUDA/Makefile deleted file mode 100644 index 7aa5f4898..000000000 --- a/GraphBLAS/CUDA/Makefile +++ /dev/null @@ -1,141 +0,0 @@ -#------------------------------------------------------------------------------- -# GraphBLAS/CUDA/Makefile -#------------------------------------------------------------------------------- - -# cuda 10.1+ is assumed - -all: library - -GXX ?= g++ -DOXYGEN ?= doxygen -CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 - -CXX11 ?= 1 - -CUDA_DIR ?= /usr/local/cuda - -CXXFLAGS += -pthread - -ifeq ($(CXX11),1) - CXXFLAGS += -std=c++14 -endif - -EMBED_BEGIN = -rdynamic -Wl,-b,binary, -EMBED_END = ,-b,default - -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - CXXFLAGS += -D LINUX - CUDA_LIB_DIR = $(CUDA_DIR)/lib64 -else ifeq ($(UNAME_S),Darwin) - CUDA_LIB_DIR = $(CUDA_DIR)/lib -endif - -INC += -I$(CUDA_DIR)/include -LIB += -ldl -L$(CUDA_LIB_DIR) -lcuda -lcudart -lnvrtc - - -GCC ?= gcc - -SRC = GB*.cu -SRC2 = $(notdir $(wildcard $(SRC))) -OBJ = $(SRC2:.cu=.o) -cSRC = $(wildcard *.c) -cOBJ = $(cSRC:.c=.o) -cppSRC = $(wildcard *.cpp) -cppOBJ = $(cppSRC:.cpp=.o) - -I = -I. -I../Source -I../Source/Template -I../Include -I../rmm/rmm/include/ -Irmm/thirdparty/spdlog/include -SO_NAME = libgraphblascuda.so -SO_OPTS = --shared \ - -Xlinker -soname \ - -Xlinker $(SO_NAME) - -LIBS = -L/usr/local/cuda/lib64 -lcudadevrt -lcudart - -CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75 \ - --relocatable-device-code true \ - --std=c++14 -Xcompiler -fPIC - -library: $(SO_NAME) - -HEADERS = jitify.hpp - -TEMPLATES := $(wildcard templates/*.cu) - -JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) - -%.cu: %.cutmp - cp $? $@ - -%.cu.jit: %.cu - ./stringify $? 
> $@

-stringify: stringify.cpp
-	$(GXX) -o $@ $< -O3 -Wall
-
-
-doc: jitify.hpp Doxyfile
-	$(DOXYGEN) Doxyfile
-.PHONY: doc
-
-test: $(cOBJ)
-	@echo $(cOBJ)
-
-$(cppOBJ): %.o: %.cpp
-	$(GXX) $(I) -fPIC -o $@ -c $< -O2 -Wall
-
-$(cOBJ): %.o: %.c
-	$(GCC) $(I) -fPIC -o $@ -c $< -O2 -Wall
-
-$(SO_NAME): $(OBJ) $(cOBJ) $(cppOBJ) $(JIT_TEMP) GB_AxB_dot3_cuda.o
-	echo $(OBJ)
-	nvcc $(SO_OPTS) $(LIBS) $(OBJ) $(cOBJ) -o $@
-
-GB_AxB_dot3_cuda.o: $(JIT_TEMP) GB_cuda_semiring_factory.hpp matrix.h
-%.o: %.cu
-	nvcc -c $(I) $(CUDA_OPTS) -o $@ $< $(LIBS)
-
-
-config:
-	nvidia-smi
-	nvcc --version
-	@echo " "
-	@echo "SO_NAME: " $(SO_NAME)
-	@echo "SO_OPTS: " $(SO_OPTS)
-	@echo "LIBS: " $(LIBS)
-	@echo "CUDA_OPTS: " $(CUDA_OPTS)
-	@echo "SRC: " $(SRC)
-	@echo "OBJ: " $(OBJ)
-	@echo "I: " $(I)
-	@echo " "
-	gcc --version
-	icc --version
-
-clean:
-	rm -f *.o
-	rm -f stringify
-.PHONY: clean
-
-distclean: clean
-	rm -f *.so *.a
-
-purge: distclean
-
-################################################################################
-
-
-EMBED_BEGIN = -rdynamic -Wl,-b,binary,
-EMBED_END = ,-b,default
-
-UNAME_S := $(shell uname -s)
-ifeq ($(UNAME_S),Linux)
-	CXXFLAGS += -D LINUX
-	CUDA_LIB_DIR = $(CUDA_DIR)/lib64
-else ifeq ($(UNAME_S),Darwin)
-	CUDA_LIB_DIR = $(CUDA_DIR)/lib
-endif
-
-
-
-
diff --git a/GraphBLAS/CUDA/TODO.c b/GraphBLAS/CUDA/TODO.c
new file mode 100644
index 000000000..10498bab3
--- /dev/null
+++ b/GraphBLAS/CUDA/TODO.c
@@ -0,0 +1,12 @@
+
+// FIXME: make dndn its own kernel, when both A and B are dense
+// also make a dndn kernel where A and/or B are bitmap/dense
+// GB_BUCKET_DNVS: A(:,i) is dense and B(:,j) is very sparse
+// GB_BUCKET_DNSP: A(:,i) is dense and B(:,j) is sparse
+// GB_BUCKET_VSDN: B(:,j) is dense and A(:,i) is very sparse
+// GB_BUCKET_SPDN: B(:,j) is dense and A(:,i) is sparse
+// FIXME: the four D*S* and *S*D* buckets should be split into different
+// kernels when A is bitmap/full and B is sparse/hypersparse or vice
+// versa.  They would still need a preprocessing phase1 to split the
+// entries of C into 2 buckets.
+
diff --git a/GraphBLAS/CUDA/TODO.txt b/GraphBLAS/CUDA/TODO.txt
deleted file mode 100644
index b80a11783..000000000
--- a/GraphBLAS/CUDA/TODO.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-
-TODO Get libgraphblascuda.a to work.
-TODO why is pthread demo hanging with CUDA?
-
diff --git a/GraphBLAS/CUDA/binary_search.h b/GraphBLAS/CUDA/binary_search.h
deleted file mode 100644
index ad15a32bf..000000000
--- a/GraphBLAS/CUDA/binary_search.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-
-#define GB_GETA( aval, ax, p) aval = (T_Z)ax[ ( p )]
-#define GB_GETB( bval, bx, p) bval = (T_Z)bx[ ( p )]
-#define GB_FLIP(i) (-(i)-2)
-#define GB_IS_FLIPPED(i) ((i) < 0)
-#define GB_IS_ZOMBIE(i) ((i) < 0)
-#define GB_IS_NOT_FLIPPED(i) ((i) >= 0)
-//#define GB_IS_NOT_ZOMBIE(i) ((i) >= 0)
-#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i))
-
-//------------------------------------------------------------------------------
-// GB_BINARY_SEARCH
-//------------------------------------------------------------------------------
-
-// search for integer i in the list X [pleft...pright]; no zombies.
-// The list X [pleft ... pright] is in ascending order.  It may have
-// duplicates.
-
-#define GB_BINARY_TRIM_SEARCH(i,X,pleft,pright) \
-{ \
-    /* binary search of X [pleft ... 
pright] for integer i */ \ - while (pleft < pright) \ - { \ - int64_t pmiddle = (pleft + pright) / 2 ; \ - if (X [pmiddle] < i) \ - { \ - /* if in the list, it appears in [pmiddle+1..pright] */ \ - pleft = pmiddle + 1 ; \ - } \ - else \ - { \ - /* if in the list, it appears in [pleft..pmiddle] */ \ - pright = pmiddle ; \ - } \ - } \ - /* binary search is narrowed down to a single item */ \ - /* or it has found the list is empty */ \ - /*ASSERT (pleft == pright || pleft == pright + 1) ;*/ \ -} diff --git a/GraphBLAS/CUDA/go b/GraphBLAS/CUDA/go deleted file mode 100755 index 4d7e48c80..000000000 --- a/GraphBLAS/CUDA/go +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -./jitFactory > o ; vim o - diff --git a/GraphBLAS/CUDA/jitFactory.hpp b/GraphBLAS/CUDA/jitFactory.hpp index 18d14caaf..59561db65 100644 --- a/GraphBLAS/CUDA/jitFactory.hpp +++ b/GraphBLAS/CUDA/jitFactory.hpp @@ -27,6 +27,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* fixme: need to split this into multiple files. One for the general +bucket-based dot3 method (A and B both sparse/hyper), one for non-bucket- +based dot3 methods (A and/or B bitmap/full), one for reduction, etc + +Otherwise, this will get too large when constructing all the CUDA kernels +for all of GraphBLAS. +*/ + /* Extended example for building on-the-fly kernels with C interface. Simple examples demonstrating different ways to load source code @@ -42,14 +50,19 @@ extern "C" { #include "GraphBLAS.h" }; #include "GB_jit_launcher.h" -#include "GB_cuda_semiring_factory.hpp" +#include "GB_cuda_mxm_factory.hpp" +#include "GB_cuda_reduce_factory.hpp" #include "GB_cuda_buckets.h" #include "GB_cuda_type_wrap.hpp" #include "GB_cuda_error.h" #include "../rmm_wrap/rmm_wrap.h" +#include "GB_iceil.h" +// fixme: C11 is already required for all of GraphBLAS. No need to test here: #if __cplusplus >= 201103L +constexpr unsigned int SMEM = 0; + /** * This file is responsible for picking all the parameters and what kernel variaiton we will use for a given instance * - data types @@ -71,6 +84,7 @@ inline std::istream* (*file_callback)(std::string, std::iostream&); //AxB_dot3_phase1 kernel launchers template class phase1launchFactory ; +template class dense_phase1launchFactory ; //AxB_dot3_phase3 kernel launchers @@ -80,104 +94,189 @@ template< typename T_C, typename T_M, static const std::vector compiler_flags{ "-std=c++14", - "-G", + //"-G", "-remove-unused-globals", "-w", "-D__CUDACC_RTC__", "-I.", "-I..", - "-I../../Source", - "-I../../Source/Template", + // "-I../../Source", + // "-I../../Source/Template", "-I../templates", // Add includes relative to GRAPHBLAS_SOURCE_PATH variable "-I" + jit::get_user_graphblas_source_path() + "/CUDA", "-I" + jit::get_user_graphblas_source_path() + "/CUDA/templates", - "-I" + jit::get_user_graphblas_source_path() + "/Source", - "-I" + jit::get_user_graphblas_source_path() + "/Source/Template", + // "-I" + jit::get_user_graphblas_source_path() + "/Source", + // "-I" + jit::get_user_graphblas_source_path() + "/Source/Template", "-I/usr/local/cuda/include", }; static const std::vector header_names ={}; -// FIXME: We probably want to remove this type template altogether and provide a -// macro/function that can convert from a GrB_Type instance to the name of a type -// that the jitifier will accept. 
+//------------------------------------------------------------------------------
+// dot3: dense_phase1launchFactory
+//------------------------------------------------------------------------------
+
+// Handles full/bitmap cases, which means we don't need buckets and zombies.
+// This is a much simpler kernel as a result; it only does the i,j lookup
+// and stores the values in Mi and Ci.
+
+template <int threads_per_block = 32, int chunk_size = 128>
+class dense_phase1launchFactory
+{
+  std::string base_name = "GB_jit";
+  std::string kernel_name = "AxB_dense_phase1";
+
+  GB_cuda_mxm_factory &mxm_factory_;
+
+public:
+
+  int get_number_of_blocks(GrB_Matrix M) {
+      int number_of_sms = GB_Global_gpu_sm_get (0);
+      int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size;
+      return GB_IMIN( nblks, chunk_size * number_of_sms);
+  }
+
+  int get_threads_per_block() {
+      return threads_per_block;
+  }
+
+  // This assumes the needed state on the GB_cuda_mxm_factory
+  // has already been populated
+  dense_phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){}
+
+  bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) {
+
+    // Idea is to have each task work on a contiguous block of columns of C
+    // Note: for small tests, mnz is small so ntasks is governed by
+    // chunksize, not chunk_size*number_of_sms.  For large problems in
+    // production, chunksize is less important since ntasks will likely be
+    // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for
+    // the default chunk_size of 128).
+
+    // Defining dummy instance only so we can introspect type
+//    // (1) create the mxm code and name
+
+//    // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h"
+    jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
+    filecache.getFile (mxm_factory_) ;
+
+    auto sr_code = std::to_string(mxm_factory_.sr_code);
+
+    std::stringstream string_to_be_jitted ;
+    // FIXME: use mask_ecode instead, not even M->type->name
+    std::vector<std::string> template_types = {M->type->name, sr_code};
+
+    std::string hashable_name = base_name + "_" + kernel_name;
+    string_to_be_jitted << hashable_name << std::endl <<
+    R"(#include ")" << jit::get_user_home_cache_dir() << "/" << mxm_factory_.filename << R"(")" << std::endl <<
+    R"(#include "templates/)" << hashable_name << R"(.cuh")" << std::endl;
+
+    bool result = false;
+
+    dim3 grid(get_number_of_blocks(M));
+    dim3 block(get_threads_per_block());
+
+    jit::launcher( hashable_name + "_" + sr_code,    // FIXME: use mask_ecode
+                   string_to_be_jitted.str(),
+                   header_names,
+                   compiler_flags,
+                   file_callback)
+               .set_kernel_inst( kernel_name, template_types)
+               .configure(grid, block, SMEM, stream)
+               .launch( C, M, A, B);
+
+    result = true;
+
+    return result;
+  }
+};
+
+//------------------------------------------------------------------------------
+// dot3: phase1launchFactory
+//------------------------------------------------------------------------------
+
+// FIXME: We probably want to remove this type template altogether and provide
+// a macro/function that can convert from a GrB_Type instance to the name of a
+// type that the jitifier will accept.
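+
+// A sketch of what such a helper could look like (illustrative only;
+// `jit_type_name` is not an existing function): the jitifier only needs the
+// type's name string, which GraphBLAS already stores in every GrB_Type, so
+// the type template parameters used below could be replaced by:
+//
+//      static inline std::string jit_type_name (GrB_Type type)
+//      {
+//          // e.g. "int32_t" or "double"; works for built-in and
+//          // user-defined types alike, since both carry a name
+//          return (std::string (type->name)) ;
+//      }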
+ template class phase1launchFactory { std::string base_name = "GB_jit"; std::string kernel_name = "AxB_phase1"; - GB_cuda_semiring_factory &semiring_factory_; + GB_cuda_mxm_factory &mxm_factory_; public: int get_number_of_blocks(GrB_Matrix M) { int number_of_sms = GB_Global_gpu_sm_get (0); int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; - return GB_IMIN( nblks, 128 * number_of_sms); + return GB_IMIN( nblks, chunk_size * number_of_sms); } int get_threads_per_block() { return threads_per_block; } - // This assumes the needed state on the GB_cuda_semiring_factory + // This assumes the needed state on the GB_cuda_mxm_factory // has already been populated - phase1launchFactory(GB_cuda_semiring_factory &semiring_factory): semiring_factory_(semiring_factory){} + phase1launchFactory(GB_cuda_mxm_factory &mxm_factory): mxm_factory_(mxm_factory){} bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) { + GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { // Idea is to have each task work on a continguous block of columns of C // Note: for small tests, mnz is small so ntasks is be governed by - // chunksize, not 128*number_of_sms. For large problems in production, - // chunksize is less important since ntasks will likely be bounded by - // 128*number_of_sms (say 128*80 = 10,240 on a V100). + // chunksize, not chunk_size*number_of_sms. For large problems in + // production, chunksize is less important since ntasks will likely be + // bounded by chunk_size*number_of_sms (say 128*80 = 10,240 on a V100, for + // the default chunk_size of 128). // Defining dummy instance only so we can introspect type +// // (1) create the mxm code and name - std::cout << "A TYpe: " << A->type << std::endl; - std::cout << "B TYpe: " << B->type << std::endl; -// // (1) create the semiring code and name - - // // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h" +// // (2) ensure the jitifier has "GB_mxm_[mymxm.sr_code].h" jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (semiring_factory_) ; + filecache.getFile (mxm_factory_) ; - auto sr_code = std::to_string(semiring_factory_.sr_code); + auto sr_code = std::to_string(mxm_factory_.sr_code); std::stringstream string_to_be_jitted ; + // FIXME: use mask_ecode instead, not even M->type->name std::vector template_types = {M->type->name, sr_code}; std::string hashable_name = base_name + "_" + kernel_name; string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << jit::get_user_home_cache_dir() << "/" << semiring_factory_.filename << R"(")" << std::endl << + R"(#include ")" << jit::get_user_home_cache_dir() << "/" << mxm_factory_.filename << R"(")" << std::endl << R"(#include "templates/)" << hashable_name << R"(.cuh")" << std::endl; - std::cout << string_to_be_jitted.str(); bool result = false; dim3 grid(get_number_of_blocks(M)); dim3 block(get_threads_per_block()); - jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code, + jit::launcher( hashable_name + "_" + sr_code, // FIXME: use mask_ecode string_to_be_jitted.str(), header_names, compiler_flags, file_callback) .set_kernel_inst( kernel_name, template_types) - .configure(grid, block) + .configure(grid, block, SMEM, stream) .launch( nanobuckets, blockBucket, C, M, A, B); - checkCudaErrors( cudaDeviceSynchronize() ); result = true; return result; } }; +//------------------------------------------------------------------------------ +// dot3: 
phase2launchFactory +//------------------------------------------------------------------------------ + template class phase2launchFactory { @@ -195,12 +294,19 @@ class phase2launchFactory const int64_t mnz = GB_nnz (M) ; int ntasks = ( mnz +chunk_size -1)/chunk_size; // Idea is to have each task work on a continguous block of columns of C - ntasks = GB_IMIN( ntasks, 128*GB_Global_gpu_sm_get (0)) ; // ntasks will be grid.x + ntasks = GB_IMIN( ntasks, chunk_size*GB_Global_gpu_sm_get (0)) ; // ntasks will be grid.x return (ntasks + threads_per_block - 1) / threads_per_block ; } + int get_number_of_phase1_blocks( GrB_Matrix M){ + const int64_t mnz = GB_nnz (M) ; + int number_of_sms = GB_Global_gpu_sm_get (0); + int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size; + return GB_IMIN( nblks, chunk_size * number_of_sms); + } + bool jitGridBlockLaunch(// parameters to AxB_phase2: - int64_t *blockBucket, int64_t *offset, GrB_Matrix M) { + int64_t *blockBucket, int64_t *offset, GrB_Matrix M, cudaStream_t stream = 0) { bool result = false; @@ -212,9 +318,6 @@ class phase2launchFactory string_to_be_jitted << hashable_name << std::endl << R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - // dump it: - std::cout << string_to_be_jitted.str(); - const int64_t mnz = GB_nnz (M) ; jit::launcher( hashable_name, string_to_be_jitted.str(), @@ -222,11 +325,10 @@ class phase2launchFactory compiler_flags, file_callback) .set_kernel_inst( kernel_name, {}) - .configure(grid, block) + .configure(grid, block, SMEM, stream) // parameters to AxB_phase2: - .launch( blockBucket, offset, get_number_of_blocks(M)); + .launch( blockBucket, offset, get_number_of_phase1_blocks(M)); - checkCudaErrors( cudaDeviceSynchronize() ); result= true; return result; @@ -234,6 +336,10 @@ class phase2launchFactory }; +//------------------------------------------------------------------------------ +// dot3: phase2endlaunchFactory +//------------------------------------------------------------------------------ + template< int threads_per_block = 32, int chunk_size = 128> class phase2endlaunchFactory { @@ -253,12 +359,12 @@ class phase2endlaunchFactory int number_of_sms = GB_Global_gpu_sm_get (0); // Idea is to have each task work on a continguous block of columns of C - return GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x + return GB_IMIN( ntasks, chunk_size*number_of_sms) ; // ntasks will be grid.x } bool jitGridBlockLaunch(int64_t *nanobuckets, int64_t *blockBucket, int64_t *bucketp, int64_t *bucket, int64_t *offset, - GrB_Matrix C, GrB_Matrix M) + GrB_Matrix C, GrB_Matrix M, cudaStream_t stream = 0) { bool result = false; @@ -271,19 +377,15 @@ class phase2endlaunchFactory string_to_be_jitted << hashable_name << std::endl << R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; - // dump it: - std::cout << string_to_be_jitted.str(); - jit::launcher( hashable_name, string_to_be_jitted.str(), header_names, compiler_flags, file_callback) .set_kernel_inst( kernel_name , {}) - .configure(grid, block) + .configure(grid, block, SMEM, stream) .launch( nanobuckets, blockBucket, bucketp, bucket, offset, C, GB_nnz (M)); - checkCudaErrors( cudaDeviceSynchronize() ); result= true; return result; @@ -291,37 +393,262 @@ class phase2endlaunchFactory }; + +//------------------------------------------------------------------------------ +// dot3: mxm_dense_launchFactory +//------------------------------------------------------------------------------ + +class mxm_dense_launchFactory +{ + std::string base_name = 
"GB_jit"; + std::string kernel_name = "AxB_dot3_phase3_dndn"; + + GB_cuda_mxm_factory &mxm_factory_; + +public: + + /** + * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. + * The `bucket_code` determines which kernel is launched + */ + mxm_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): + mxm_factory_(mymxmfactory) {} + + bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, + cudaStream_t stream = 0) { + + bool result = false; + + //---------------------------------------------------------------------- + // do the numerical work + //---------------------------------------------------------------------- + + const int64_t nz = GB_nnz(M); // number of dots in the mask + const int64_t mnvec = M->nvec ; + + int gridsz, blocksz; + + std::stringstream final_kernel_name_ss; + final_kernel_name_ss << kernel_name; + + /** + * Configure geometry and kernel function name based on sparsity of C and number of vectors in M + */ + configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); + + auto sr_code = std::to_string(mxm_factory_.sr_code); + + GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; + + std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); + std::stringstream string_to_be_jitted ; + std::vector template_types = + { + C->type->name, A->type->name, B->type->name, + mult->ztype->name, mult->xtype->name, mult->ytype->name, + sr_code + }; + + jit::GBJitCache filecache = jit::GBJitCache::Instance() ; + filecache.getFile (mxm_factory_) ; + + string_to_be_jitted << hashable_name << std::endl << + R"(#include ")" << jit::get_user_home_cache_dir() << "/" << mxm_factory_.filename << R"(")" << std::endl << + R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; + + dim3 grid(gridsz); + dim3 block(blocksz); + + GBURBLE ("(GPU dot3 mxm dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; + jit::launcher( hashable_name + "_" + sr_code, + string_to_be_jitted.str(), + header_names, + compiler_flags, + file_callback) + .set_kernel_inst(final_kernel_name_ss.str(), template_types ) + // { C->type->name, + // A->type->name, + // B->type->name }) + .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch + .launch( + C, // final output matrix + // inputs, not modified: + M, // Mi used for column index + A, // A matrix + B // B matrix + ); + + result= true; + + return result; + } + +private: + void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, + int &blocksz, int &gridsz) { + int number_of_sms = GB_Global_gpu_sm_get (0) ; + + int work_per_thread; + + blocksz = 64; + work_per_thread = 8; + + if( Cnz > 1024){ + blocksz = 512; + work_per_thread = 64; + } + + // gridsz = ceiling (Cnz / work_per_thread*blocksz) + gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; + + } +}; + +//------------------------------------------------------------------------------ +// dot3: mxm_sparse_dense_launchFactory +//------------------------------------------------------------------------------ + +class mxm_sparse_dense_launchFactory +{ + std::string base_name = "GB_jit"; + std::string kernel_name = "AxB_dot3_phase3_spdn"; + + GB_cuda_mxm_factory &mxm_factory_; + +public: + + /** + * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
+ * The `bucket_code` determines which kernel is launched + */ + mxm_sparse_dense_launchFactory(GB_cuda_mxm_factory &mymxmfactory): + mxm_factory_(mymxmfactory) {} + + bool jitGridBlockLaunch( GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, + cudaStream_t stream = 0) { + + bool result = false; + + //---------------------------------------------------------------------- + // do the numerical work + //---------------------------------------------------------------------- + + const int64_t nz = GB_nnz(M); // number of dots in the mask + const int64_t mnvec = M->nvec ; + + int gridsz, blocksz; + + std::stringstream final_kernel_name_ss; + final_kernel_name_ss << kernel_name; + + /** + * Configure geometry and kernel function name based on sparsity of C and number of vectors in M + */ + configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz); + + auto sr_code = std::to_string(mxm_factory_.sr_code); + + GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; + + std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); + std::stringstream string_to_be_jitted ; + std::vector template_types = + { + C->type->name, A->type->name, B->type->name, + mult->ztype->name, mult->xtype->name, mult->ytype->name, + sr_code + }; + + jit::GBJitCache filecache = jit::GBJitCache::Instance() ; + filecache.getFile (mxm_factory_) ; + + string_to_be_jitted << hashable_name << std::endl << + R"(#include ")" << jit::get_user_home_cache_dir() << "/" << mxm_factory_.filename << R"(")" << std::endl << + R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; + + dim3 grid(gridsz); + dim3 block(blocksz); + + GBURBLE ("(GPU dot3 mxm sparse_dense launch nblocks,blocksize= %d,%d )\n", gridsz,blocksz) ; + jit::launcher( hashable_name + "_" + sr_code, + string_to_be_jitted.str(), + header_names, + compiler_flags, + file_callback) + .set_kernel_inst(final_kernel_name_ss.str(), template_types ) + // { C->type->name, + // A->type->name, + // B->type->name }) + .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch + .launch( + C, // final output matrix + // inputs, not modified: + M, // Mi used for column index + A, // A matrix + B // B matrix + ); + + result= true; + + return result; + } + +private: + void configure(std::int64_t Cnz, std::int64_t mnvec, std::stringstream &opname, + int &blocksz, int &gridsz) { + int number_of_sms = GB_Global_gpu_sm_get (0) ; + + int work_per_thread; + + blocksz = 64; + work_per_thread = 8; + + if( Cnz > 1024){ + blocksz = 512; + work_per_thread = 64; + } + + // gridsz = ceiling (Cnz / work_per_thread*blocksz) + gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; + + } +}; + +//------------------------------------------------------------------------------ +// dot3: phase3launchFactory +//------------------------------------------------------------------------------ + class phase3launchFactory { std::string base_name = "GB_jit"; std::string kernel_name = "AxB_dot3"; - GB_cuda_semiring_factory &semiring_factory_; + GB_cuda_mxm_factory &mxm_factory_; GB_bucket_code bucket_code_; public: - + std::string Opname; /** - * This assumes the needed state on the GB_cuda_semiring_factory has already been populated. + * This assumes the needed state on the GB_cuda_mxm_factory has already been populated. 
* The `bucket_code` determines which kernel is launched */ - phase3launchFactory(GB_cuda_semiring_factory &mysemiringfactory, GB_bucket_code bucket_code): - semiring_factory_(mysemiringfactory), bucket_code_(bucket_code) {} + phase3launchFactory(GB_cuda_mxm_factory &mymxmfactory, GB_bucket_code bucket_code): + mxm_factory_(mymxmfactory), bucket_code_(bucket_code) {} bool jitGridBlockLaunch(int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) { + GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, + cudaStream_t stream = 0) { bool result = false; //---------------------------------------------------------------------- // phase3: do the numerical work //---------------------------------------------------------------------- - C->jumbled = true; - C->nzombies = bucketp[1]; //set pre-zombie counts - const int64_t Cnz = GB_nnz (C) ; + + const int64_t nz = end - start; // number of dots in this bucket const int64_t mnvec = M->nvec ; int gridsz, blocksz, sz = 4; @@ -332,33 +659,43 @@ class phase3launchFactory /** * Configure geometry and kernel function name based on sparsity of C and number of vectors in M */ - configure(Cnz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz); + configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz); + + auto sr_code = std::to_string(mxm_factory_.sr_code); + + GrB_BinaryOp mult = mxm_factory_.semiring->multiply ; std::string hashable_name = base_name + "_" + final_kernel_name_ss.str(); std::stringstream string_to_be_jitted ; + std::vector template_types = + { + C->type->name, A->type->name, B->type->name, + mult->ztype->name, mult->xtype->name, mult->ytype->name, + sr_code + }; jit::GBJitCache filecache = jit::GBJitCache::Instance() ; - filecache.getFile (semiring_factory_) ; + filecache.getFile (mxm_factory_) ; string_to_be_jitted << hashable_name << std::endl << - R"(#include ")" << jit::get_user_home_cache_dir() << "/" << semiring_factory_.filename << R"(")" << std::endl << + R"(#include ")" << jit::get_user_home_cache_dir() << "/" << mxm_factory_.filename << R"(")" << std::endl << R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; dim3 grid(gridsz); dim3 block(blocksz); - C->nzombies = 0; - GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",start,end,gridsz,blocksz) ; - jit::launcher( hashable_name, + GBURBLE ("(GPU phase3 launch %s st,end=%ld,%ld nblocks,blocksize= %d,%d )\n", this->Opname.c_str(), + start,end,gridsz,blocksz) ; + jit::launcher( hashable_name + "_" + sr_code, string_to_be_jitted.str(), header_names, compiler_flags, file_callback) - .set_kernel_inst(final_kernel_name_ss.str(), - { C->type->name, - A->type->name, - B->type->name }) - .configure(grid, block) //if commented, use implicit 1D configure in launch + .set_kernel_inst(final_kernel_name_ss.str(), template_types ) + // { C->type->name, + // A->type->name, + // B->type->name }) + .configure(grid, block, SMEM, stream) //if commented, use implicit 1D configure in launch .launch( start, // input/output: end, // global bucket cumsum, of size NBUCKETS+1 @@ -371,10 +708,6 @@ class phase3launchFactory sz // only used for sparse-sparse cases ); - GBURBLE ("(GPU phase3 done) ") ; - - // do we really want to sync after each kernel launch in production? 
- checkCudaErrors( cudaDeviceSynchronize() ); result= true; return result; @@ -385,9 +718,8 @@ class phase3launchFactory int &blocksz, int &gridsz, int &sz) { int number_of_sms = GB_Global_gpu_sm_get (0) ; - std::string Opname; + int work_per_thread; - printf("LAUNCHING BUCKET CODE: %d\n", (int)bucket_code_); switch (bucket_code_) { @@ -399,70 +731,22 @@ class phase3launchFactory break ; //-------------------------------------------------------------- - // CUDA kernel: dndn, handles a single bucket: - //-------------------------------------------------------------- - - // both A(:,i) and B(:,j) are dense - case GB_BUCKET_DNDN : - Opname = "phase3_dndn" ; - - blocksz = 32; - gridsz = ( Cnz -1 + blocksz)/blocksz; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: spdn, handles 4 buckets: - //-------------------------------------------------------------- - - // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) - case GB_BUCKET_DNVS : - // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense - case GB_BUCKET_VSDN : - sz = 64 ; - Opname = "phase3_spdn" ; - blocksz = 32; - gridsz = ( Cnz -1 + blocksz)/blocksz; - break ; - - // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) - case GB_BUCKET_DNSP : - // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense - case GB_BUCKET_SPDN : - printf("Confiring spdn"); - sz = 256 ; - Opname = "phase3_spdn" ; - blocksz = 32; - gridsz = ( Cnz -1 + blocksz)/blocksz; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vssp, handles 1 bucket, uses binary search: - //-------------------------------------------------------------- - - // A(:,i) is very sparse compared to B(:,j), or visa versa - case GB_BUCKET_VSSP : - Opname = "phase3_vssp" ; - blocksz = 32; - gridsz = ( Cnz -1 + blocksz)/blocksz; - break ; - - //-------------------------------------------------------------- - // CUDA kernel: vsvs, handles 4 buckets: + // CUDA kernel: vsvs bucket: //-------------------------------------------------------------- - // let len = nnz (A (:,i) + nnz (B (:,j)), then: - - printf("number_of_sms=%d\n", number_of_sms); - case GB_BUCKET_VSVS_256 : sz += 256-64 ; - case GB_BUCKET_VSVS_64 : sz += 64-16 ; - case GB_BUCKET_VSVS_16 : sz += 16-4 ; - case GB_BUCKET_VSVS_4 : sz += 4 ; + case GB_BUCKET_VSVS : Opname = "phase3_vsvs" ; - blocksz = 512; - - // FIXME: Is the first line not needed? 
- gridsz = GB_IMIN( 1024*number_of_sms, ( Cnz + blocksz -1 )/blocksz); - gridsz = ( Cnz + blocksz -1 )/blocksz; + blocksz = 256; + work_per_thread = 4; + + if( Cnz > (2<<12)){ + blocksz = 512; + work_per_thread = 4; + } + + // gridsz = ceiling (Cnz / work_per_thread*blocksz) + gridsz = GB_ICEIL (Cnz, work_per_thread*blocksz) ; + if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; break ; //-------------------------------------------------------------- @@ -472,13 +756,17 @@ class phase3launchFactory case GB_BUCKET_MERGEPATH : Opname = "phase3_mp" ; blocksz = 32; - gridsz = ( Cnz -1 + blocksz)/blocksz; - break ; - - case GB_BUCKET_WARP_IX : sz = 32 ; - Opname = "phase3_warpix" ; - blocksz = 32; - gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms); + work_per_thread = 256 ; + + if( Cnz > (2<<20)){ + work_per_thread = 1024; + } + gridsz = GB_ICEIL (Cnz, work_per_thread) ; + if ((gridsz < number_of_sms) && (Cnz > (2<<20))) + { + gridsz = number_of_sms; + } + if (gridsz > 256*number_of_sms) gridsz = 256*number_of_sms; break ; default: @@ -489,52 +777,56 @@ class phase3launchFactory } }; +//------------------------------------------------------------------------------ +// reduceFactory +//------------------------------------------------------------------------------ + class reduceFactory { std::string base_name = "GB_jit"; std::string kernel_name = "reduceNonZombiesWarp"; - int threads_per_block = 128; + int threads_per_block = 320 ; + int work_per_thread = 256; + int number_of_sms = GB_Global_gpu_sm_get (0); + + GB_cuda_reduce_factory &reduce_factory_; public: + reduceFactory(GB_cuda_reduce_factory &myreducefactory) : reduce_factory_(myreducefactory) {} + int get_threads_per_block() { return threads_per_block; } int get_number_of_blocks(unsigned int N) { - return (N + threads_per_block - 1)/threads_per_block; + return (N + work_per_thread*threads_per_block - 1)/(work_per_thread*threads_per_block); } // Note: this does assume the erased types are compatible w/ the monoid's ztype bool jitGridBlockLaunch(GrB_Matrix A, void* output, - GrB_Monoid op) + GrB_Monoid op, cudaStream_t stream = 0) { - - // TODO: We probably want to "macrofy" the GrB_Monoid and define it in the `string_to_be_jitted` -// void GB_stringify_binop -// ( -// // input: -// FILE *fp, // File to write macros, assumed open already -// const char *macro_name, // name of macro to construct -// GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro -// GB_Type_code xcode, // op->xtype->code of the operator -// bool for_semiring, // if true: op is a multiplier in a semiring -// bool flipxy // if true, use mult(y,x) else mult(x,y) -// ) + GBURBLE ("\n(launch reduce factory) \n") ; GrB_Scalar temp_scalar; GrB_Scalar_new(&temp_scalar, op->op->ztype); cuda::jit::scalar_set_element(temp_scalar, 0); - GrB_Scalar_wait(temp_scalar, GrB_MATERIALIZE); + jit::GBJitCache filecache = jit::GBJitCache::Instance() ; + filecache.getFile (reduce_factory_) ; + + auto rcode = std::to_string(reduce_factory_.rcode); + std::string hashable_name = base_name + "_" + kernel_name; std::stringstream string_to_be_jitted ; string_to_be_jitted << - hashable_name << std::endl << R"(#include ")" << - hashable_name << R"(.cuh")" << std::endl; + hashable_name << std::endl << + R"(#include ")" << jit::get_user_home_cache_dir() << "/" << reduce_factory_.filename << R"(")" << std::endl << + R"(#include ")" << hashable_name << R"(.cuh")" << std::endl; bool is_sparse = GB_IS_SPARSE(A); int64_t N = is_sparse ? 
GB_nnz(A) : GB_NCOLS(A) * GB_NROWS(A); @@ -544,19 +836,22 @@ class reduceFactory dim3 grid(gridsz); dim3 block(blocksz); - jit::launcher(hashable_name, + // FIXME: call GB_stringify_reduce to create GB_ADD and related + // macros, in an include file: GB_reduce_123412341234.h + GBURBLE ("(cuda reduce launch %d threads in %d blocks)", blocksz, gridsz ) ; + + jit::launcher(hashable_name + "_" + rcode, string_to_be_jitted.str(), header_names, compiler_flags, file_callback) - .set_kernel_inst( kernel_name , { A->type->name, op->op->ztype->name, "true" }) - .configure(grid, block) - + .set_kernel_inst( kernel_name , { A->type->name, op->op->ztype->name, rcode, "true" }) + .configure(grid, block, SMEM, stream) // FIXME: GB_ADD is hardcoded into kernel for now .launch( A, temp_scalar, N, is_sparse); - - checkCudaErrors( cudaDeviceSynchronize() ); + // Need to synchronize before copying result to host + CHECK_CUDA( cudaStreamSynchronize(stream) ); memcpy(output, temp_scalar->x, op->op->ztype->size); @@ -565,145 +860,57 @@ class reduceFactory } }; -template< int threads_per_block=32, int chunk_size = 128> -inline bool GB_cuda_mxm_phase1(GB_cuda_semiring_factory &semiring_factory, int64_t *nanobuckets, int64_t *blockBucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) { - phase1launchFactory lf(semiring_factory); - return lf.jitGridBlockLaunch(nanobuckets, blockBucket, C, M, A, B); +//------------------------------------------------------------------------------ + +template< int threads_per_block=32, int chunk_size = 256> +inline bool GB_cuda_mxm_phase1(GB_cuda_mxm_factory &mxm_factory, int64_t *nanobuckets, int64_t *blockBucket, + GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, + cudaStream_t stream = 0) { + phase1launchFactory lf(mxm_factory); + return lf.jitGridBlockLaunch(nanobuckets, blockBucket, C, M, A, B, stream); } +//------------------------------------------------------------------------------ -template +template bool GB_cuda_mxm_phase2(int64_t *nanobuckets, int64_t *blockBucket, int64_t *bucketp, int64_t *bucket, int64_t *offset, - GrB_Matrix M) { + GrB_Matrix M, cudaStream_t stream = 0) { phase2launchFactory lf; - return lf.jitGridBlockLaunch(nanobuckets, blockBucket, bucketp, bucket, offset, M); + return lf.jitGridBlockLaunch(nanobuckets, blockBucket, bucketp, bucket, offset, M, stream); } +//------------------------------------------------------------------------------ + template inline bool GB_cuda_mxm_phase2end(int64_t *nanobuckets, int64_t *blockBucket, int64_t *bucketp, int64_t *bucket, int64_t *offset, - GrB_Matrix C, GrB_Matrix M) { + GrB_Matrix C, GrB_Matrix M, cudaStream_t stream) { phase2endlaunchFactory lf; - return lf.jitGridBlockLaunch(nanobuckets, blockBucket, bucketp, bucket, offset, C, M); + return lf.jitGridBlockLaunch(nanobuckets, blockBucket, bucketp, bucket, offset, C, M, stream); } +//------------------------------------------------------------------------------ - -inline bool GB_cuda_mxm_phase3(GB_cuda_semiring_factory &mysemiringfactory, GB_bucket_code bucket_code, +inline bool GB_cuda_mxm_phase3(GB_cuda_mxm_factory &mymxmfactory, GB_bucket_code bucket_code, int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket, - GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) { - phase3launchFactory lf(mysemiringfactory, bucket_code); - return lf.jitGridBlockLaunch(start, end, bucketp, bucket, C, M, A, B); + GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, cudaStream_t stream = 0) { + phase3launchFactory lf(mymxmfactory, bucket_code); 
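+  // the bucket_code selects which phase3 kernel variant (phase3_vsvs or
+  // phase3_mp) the launch factory will jit and launch for this bucket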
+ return lf.jitGridBlockLaunch(start, end, bucketp, bucket, C, M, A, B, stream); } +//------------------------------------------------------------------------------ -inline bool GB_cuda_reduce(GrB_Matrix A, void *output, GrB_Monoid op) { - reduceFactory rf; - return rf.jitGridBlockLaunch(A, output, op); +inline bool GB_cuda_reduce(GB_cuda_reduce_factory &myreducefactory, + GrB_Matrix A, void *output, GrB_Monoid op, + cudaStream_t stream = 0) { + reduceFactory rf(myreducefactory); + GBURBLE ("(starting cuda reduce)" ) ; + bool result = rf.jitGridBlockLaunch(A, output, op, stream); + GBURBLE ("(ending cuda reduce)" ) ; + return (result) ; } - -//template -//class spdotFactory -//{ -// std::string base_name = "GBjit_spDot_"; -//public: -// spdotFactory() { -// } -// -// bool jitGridBlockLaunch(int gridsz, int blocksz, unsigned int xn, unsigned int *xi, T1* x, -// unsigned int yn, unsigned int *yi, T2* y, -// T3* output, std::string OpName) -// { -// -// bool result = false; -// if (OpName == "PLUS_TIMES") { -// file_callback = &semiring_plus_times_callback; -// } -// else if (OpName == "MIN_PLUS") { -// file_callback = &semiring_min_plus_callback; -// } -// -// T1 dum1; -// T2 dum2; -// T3 dum3; -// -// dim3 grid(gridsz); -// dim3 block(blocksz); -// -// jit::launcher( base_name + OpName, -// ___templates_sparseDotProduct_cu, -// header_names, -// compiler_flags, -// file_callback) -// -// .set_kernel_inst("sparseDotProduct", -// { GET_TYPE_NAME(dum1), -// GET_TYPE_NAME(dum2), -// GET_TYPE_NAME(dum3)}) -// .configure(grid, block) -// .launch(xn, xi, x, yn, yi, y, output); -// -// -// checkCudaErrors( cudaDeviceSynchronize() ); -// result= true; -// -// return result; -// } -// -//}; -// -//template -//class dotFactory -//{ -// std::string base_name = "GBjit_dnDot_"; -//public: -// dotFactory() { -// } -// -// -// bool jitGridBlockLaunch(int gridsz, int blocksz, T1* x, T2* y, T3* output, unsigned int N, std::string OpName) -// { -// -// bool result = false; -// if (OpName == "PLUS_TIMES") { -// file_callback = &semiring_plus_times_callback; -// } -// else if (OpName == "MIN_PLUS") { -// file_callback = &semiring_min_plus_callback; -// } -// -// T1 dum1; -// T2 dum2; -// T3 dum3; -// -// dim3 grid(gridsz); -// dim3 block(blocksz); -// -// jit::launcher( base_name + OpName, -// ___templates_denseDotProduct_cu, -// header_names, -// compiler_flags, -// file_callback) -// -// .set_kernel_inst("denseDotProduct", -// { GET_TYPE_NAME(dum1), -// GET_TYPE_NAME(dum2), -// GET_TYPE_NAME(dum3)}) -// .configure(grid, block) -// .launch(x, y, output, N); -// -// checkCudaErrors( cudaDeviceSynchronize() ); -// result= true; -// -// return result; -// } -// -//}; -// -// #endif // C++11 -#endif \ No newline at end of file +#endif diff --git a/GraphBLAS/CUDA/jitify.hpp b/GraphBLAS/CUDA/jitify.hpp index 6370c6259..a744dd6ad 100644 --- a/GraphBLAS/CUDA/jitify.hpp +++ b/GraphBLAS/CUDA/jitify.hpp @@ -151,7 +151,7 @@ #define JITIFY_PRINT_LOG 1 #endif -#define JITIFY_PRINT_ALL 1 +#define JITIFY_PRINT_ALL 0 #if JITIFY_PRINT_ALL #define JITIFY_PRINT_INSTANTIATION 1 diff --git a/GraphBLAS/CUDA/matrix.h b/GraphBLAS/CUDA/matrix.h deleted file mode 100644 index 6989f6499..000000000 --- a/GraphBLAS/CUDA/matrix.h +++ /dev/null @@ -1,145 +0,0 @@ -//SPDX-License-Identifier: Apache-2.0 - -#pragma once -#undef ASSERT -#define ASSERT(x) - -//------------------------------------------------------------------------------ -// TODO: this will be in the jit code: -#define chunksize 128 -//#define GB_GETA( aval, ax, p) aval = (T_C)ax[ 
( p )] -//#define GB_GETB( bval, bx, p) bval = (T_C)bx[ ( p )] -#define GB_ADD_F( f , s) f = GB_ADD ( f, s ) -#define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) ) -#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) ) -#define GB_DOT_TERMINAL ( c ) -//# if ( c == TERMINAL_VALUE) break; -#undef GB_DOT_MERGE -// cij += A(k,i) * B(k,j), for merge operation -#define GB_DOT_MERGE \ -{ \ - GB_GETA ( aki= (T_C)Ax[pA]) ; /* aki = A(k,i) */ \ - GB_GETB ( bkj= (T_C)Bx[pB]) ; /* bkj = B(k,j) */ \ - if (cij_exists) \ - { \ - GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \ - } \ - else \ - { \ - /* cij = A(k,i) * B(k,j), and add to the pattern */ \ - cij_exists = true ; \ - GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \ - } \ -} - -//------------------------------------------------------------------------------ - -#ifndef GRAPHBLAS_H -#define GRAPHBLAS_H - -// #include "GraphBLAS.h" - -#undef restrict -#undef GB_restrict -#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ ) - #define GB_restrict __restrict__ -#else - #define GB_restrict -#endif -#define restrict GB_restrict - -#include -#include -#include -#include - -//------------------------------------------------------------------------------ -// remainder of this file is extracted from GraphBLAS.h: -//------------------------------------------------------------------------------ - -// GB_STR: convert the content of x into a string "x" -#define GB_XSTR(x) GB_STR(x) -#define GB_STR(x) #x - -#undef GB_PUBLIC -#define GB_PUBLIC extern -#undef GxB_MAX_NAME_LEN -#define GxB_MAX_NAME_LEN 128 - -typedef uint64_t GrB_Index ; -typedef struct GB_Descriptor_opaque *GrB_Descriptor ; -typedef struct GB_Type_opaque *GrB_Type ; -typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ; -typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ; -typedef struct GB_SelectOp_opaque *GxB_SelectOp ; -typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ; -typedef struct GB_Monoid_opaque *GrB_Monoid ; -typedef struct GB_Semiring_opaque *GrB_Semiring ; -typedef struct GB_Scalar_opaque *GrB_Scalar ; -typedef struct GB_Vector_opaque *GrB_Vector ; -typedef struct GB_Matrix_opaque *GrB_Matrix ; - -#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form -#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector) -#define GxB_BITMAP 4 // store matrix as a bitmap -#define GxB_FULL 8 // store matrix as full; all entries must be present - -typedef void (*GxB_unary_function) (void *, const void *) ; -typedef void (*GxB_binary_function) (void *, const void *, const void *) ; - -typedef bool (*GxB_select_function) // return true if A(i,j) is kept -( - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j) - const void *x, // value of A(i,j) - const void *thunk // optional input for select function -) ; - -typedef void (*GxB_index_unary_function) -( - void *z, // output value z, of type ztype - const void *x, // input value x of type xtype; value of v(i) or A(i,j) - GrB_Index i, // row index of A(i,j) - GrB_Index j, // column index of A(i,j), or zero for v(i) - const void *y // input scalar y -) ; - -typedef enum -{ - // for all GrB_Descriptor fields: - GxB_DEFAULT = 0, // default behavior of the method - - // for GrB_OUTP only: - GrB_REPLACE = 1, // clear the output before assigning new values to it - - // for GrB_MASK only: - GrB_COMP = 2, // use the structural complement of the input - GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead) - GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values - - 
// for GrB_INP0 and GrB_INP1 only:
- GrB_TRAN = 3, // use the transpose of the input
-
- // for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
- GxB_GPU_ALWAYS = 2001,
- GxB_GPU_NEVER = 2002,
-
- // for GxB_AxB_METHOD only:
- GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method
- GxB_AxB_DOT = 1003, // dot product
- GxB_AxB_HASH = 1004, // hash-based saxpy method
- GxB_AxB_SAXPY = 1005 // saxpy method (any kind)
-}
-GrB_Desc_Value ;
-
-#include "GB_opaque.h"
-#endif
-
-
-#include "GB_imin.h"
-#include "GB_zombie.h"
-#include "GB_nnz.h"
-#include "GB_partition.h"
-#include "GB_binary_search.h"
-#include "GB_search_for_vector_template.c"
-
diff --git a/GraphBLAS/CUDA/rmm_log.txt b/GraphBLAS/CUDA/rmm_log.txt
deleted file mode 100644
index d0fccc3df..000000000
--- a/GraphBLAS/CUDA/rmm_log.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-[1307177][10:34:14:662160][info ] ----- RMM LOG BEGIN [PTDS DISABLED] -----
-[1307177][10:34:14:662829][error ] [A][Stream 0x1][Upstream 262144B][FAILURE maximum pool size exceeded]
diff --git a/GraphBLAS/CUDA/spGEMMfixtures.hpp b/GraphBLAS/CUDA/spGEMMfixtures.hpp
deleted file mode 100644
index ec6e1c5b9..000000000
--- a/GraphBLAS/CUDA/spGEMMfixtures.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-
-// SpGEMM Test Fixtures
-// Provides test setup and teardown, data generators and covers
-// all 12 cases for the masked GEMM ( C, M, A, B) in GraphBLAS
-// Connects to the jitFactory for launches.
-
-
-#pragma once
-#include
-#include
-#include
-#include
-#include
-#include "test/jitTestFactory.hpp"
-#include
-
-
-//Test generators using jitify
-
-
-TEST(SpGEMMvsvsTest, PlusTimesLongBoolIntInt) {
- test_spGEMM_vsvs_factory(5, 32, 256, 128, "PLUS_TIMES");
-}
-
-TEST(SpGEMMvsvsTest, PlusTimesInt4Test ) {
-
- test_spGEMM_vsvs_factory(5, 32, 256, 128, "PLUS_TIMES");
-}
-
-TEST(SpGEMMvsvsTest, PlusTimesInt4TestMed ) {
-
- test_spGEMM_vsvs_factory(5, 4096, 25600, 25600, "PLUS_TIMES");
-}
-
-TEST(SpGEMMvsvsTest, PlusTimesFloat4Test ) {
-
- test_spGEMM_vsvs_factory(5, 32, 256, 128, "PLUS_TIMES");
-}
-
-TEST(SpGEMMvsdnTest, PlusTimesInt4Test) {
-
- test_spGEMM_vsdn_factory(5, 32, 256, 32*32, "PLUS_TIMES");
-}
-TEST(SpGEMMvsdnTest, PlusTimesInt4TestMed) {
-
- test_spGEMM_vsdn_factory(5, 256, 4096, 256*256, "PLUS_TIMES");
-}
-
-TEST( Reductions, PlusFloat) {
- test_reducefactoryUM(4096, "PLUS");
-}
-
-TEST( Reductions, PlusDouble) {
- test_reducefactoryUM(4096, "PLUS");
-}
-
-TEST( Reductions, MinFloat) {
- test_reducefactoryUM(32,"MIN");
-}
-
-TEST( Reductions, MinInt) {
- test_reducefactoryUM(32,"MIN");
-}
-
-TEST( Reductions, MaxInt) {
- test_reducefactoryUM(32,"MAX");
-}
-
diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dense_phase1.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dense_phase1.cuh
new file mode 100644
index 000000000..253b720bf
--- /dev/null
+++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dense_phase1.cuh
@@ -0,0 +1,169 @@
+//------------------------------------------------------------------------------
+// templates/GB_jit_AxB_dense_phase1.cuh: symbolic load balancing and data
+// partition to assign work to different 'buckets' for later compute
+//------------------------------------------------------------------------------
+
+// This kernel scans the non-zero patterns of A and B, takes the mask into
+// account, and computes the total work required to form C. Then it
+// classifies each dot product into a set of buckets for efficient
+// computation.
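+//
+// The bucket assignment is packed into the low 4 bits of Ci (see the
+// comments on Ci below). A minimal sketch of that encoding, using
+// hypothetical helper names that are not part of this file:
+//
+//      // pack: C(:,j) is the kth vector of C; bucket is in 0..15
+//      static __device__ __inline__ int64_t bucket_encode (int64_t k, int bucket)
+//      {
+//          return ((k << 4) | ((int64_t) bucket & 0xF)) ;
+//      }
+//
+//      // unpack: recover the bucket, and the vector index k of C(:,j)
+//      static __device__ __inline__ int bucket_decode (int64_t ci, int64_t *k)
+//      {
+//          (*k) = (ci >> 4) ;
+//          return ((int) (ci & 0xF)) ;
+//      }
+//
+// This dense-case kernel always assigns bucket 0, so it stores (k << 4)
+// directly (see the assignment to Ci at the end of this file).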
+
+#pragma once
+
+#define GB_CUDA_KERNEL
+#include
+#include "GB_cuda_kernel.h"
+#include "GB_cuda_buckets.h"
+#include
+#include <cooperative_groups.h>
+
+using namespace cooperative_groups;
+//------------------------------------------------------------------------------
+// GB_AxB_dense_phase1: lookup i,j pairs and store in Mi, Ci
+//------------------------------------------------------------------------------
+
+// GB_AxB_dense_phase1 is a CUDA kernel that scans all entries in M and
+// assigns i,j coordinates for each entry, and stores them in Mi and Ci.
+
+
+template <typename T_M>
+__global__ void AxB_dense_phase1
+(
+ // input/output:
+ GrB_Matrix C, // final output matrix
+ // inputs, not modified:
+ const GrB_Matrix M, // mask matrix
+ const GrB_Matrix A, // input matrix
+ const GrB_Matrix B // input matrix
+)
+{
+
+ //--------------------------------------------------------------------------
+ // get C, M, A, and B
+ //--------------------------------------------------------------------------
+
+ const int64_t *__restrict__ Mh = M->h ;
+ const int64_t *__restrict__ Mp = M->p ;
+ const int64_t *__restrict__ Mi = M->i ;
+ const T_M *__restrict__ Mx = (T_M*) M->x ; // not accessed if M structural
+ const int64_t mnvec = M->nvec ;
+ const int64_t mvlen = M->vlen ;
+ const int64_t mnz = GB_nnz(M) ;
+ const bool M_is_hyper = M->h != NULL ;
+
+ const int64_t *__restrict__ Ah = A->h ;
+ const int64_t *__restrict__ Ap = A->p ;
+ const int64_t *__restrict__ Ai = A->i ;
+ const int64_t avlen = A->vlen ;
+ const int64_t anz = GB_nnz(A) ;
+ const bool A_is_hyper = A->h != NULL ;
+
+ const int64_t *__restrict__ Bh = B->h ;
+ const int64_t *__restrict__ Bp = B->p ;
+ const int64_t *__restrict__ Bi = B->i ;
+ const int64_t bvlen = B->vlen ;
+ const int64_t bnz = GB_nnz(B);
+ const bool B_is_hyper = B->h != NULL ;
+
+ // int64_t *restrict Cp = C->p ; // copy of Mp
+ // int64_t *restrict Ch = C->h ; // copy of Mh
+ int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment
+
+ // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a
+ // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector
+ // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and
+ // where bucket is the bucket assignment for C(i,j).
+ // bucket can be recovered from Ci by bucket = Ci & 0xF
+
+ // ASSERT (mnz > 0) ;
+ // ASSERT (gridDim.x <= mnz) ;
+
+ // shared cache used for coordinate search
+ __shared__ int64_t ks [chunk_size] ;
+
+
+ //--------------------------------------------------------------------------
+ // assign all entries of C to the buckets
+ //--------------------------------------------------------------------------
+
+ // all threads in this block will compute the same values for these:
+ int64_t pfirst, plast, kfirst, klast ;
+
+ int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ;
+ // (mnz + chunk_size -1)/chunk_size;
+ for ( int64_t chunk = blockIdx.x;
+ chunk < chunk_max;
+ chunk += gridDim.x )
+ {
+
+ //----------------------------------------------------------------------
+ // determine the work done by this iteration, "chunk"
+ //----------------------------------------------------------------------
+
+ // The slice for each task contains entries pfirst:plast-1 of M and C.
+ // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using
+ // Mi and Mx [pfirst:plast-1]. All threads in the thread block are
+ // used for this "chunk".
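+ // As a concrete example (illustrative numbers only): if mnz = 1000 and
+ // chunk_size = 128, then chunk_max = GB_ICEIL (1000, 128) = 8; chunk 0
+ // covers entries 0:127, chunk 1 covers 128:255, and the last chunk 7
+ // covers entries 896:999, with my_chunk_size = 104 after plast is
+ // clamped to mnz.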
+ pfirst = chunk_size * chunk ;
+ plast = pfirst + chunk_size ;
+ // plast = GB_IMIN (plast, mnz) ;
+ if (plast > mnz) plast = mnz ;
+ int64_t my_chunk_size = plast - pfirst ;
+
+ // find the first vector of the slice for this chunk: the
+ // vector that owns the entry Mi [pfirst] and Mx [pfirst].
+ kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ;
+
+ // find the last vector of the slice for task blockIdx.x: the
+ // vector that owns the entry Mi [plast-1] and Mx [plast-1].
+ klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen);
+
+ // number of vectors in C and M for this "chunk" iteration, where
+ // Mp [kfirst:klast] will be operated on.
+ int64_t nk = klast - kfirst + 1 ;
+
+ //----------------------------------------------------------------------
+ // fill ks to find all indices
+ //----------------------------------------------------------------------
+
+ // search for k values for each entry pfirst:plast-1
+ float slope = ((float) nk) / ((float) my_chunk_size) ;
+ int64_t mnvec1 = mnvec - 1 ;
+ for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x)
+ {
+ // get a rough estimate of k for the kkth entry in ks
+ int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ;
+ // k cannot be smaller than kfirst, but might be bigger than
+ // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1
+ // k = GB_IMIN (k, mnvec-1) ;
+ if (k > mnvec1) k = mnvec1 ;
+ // look for p in Mp, where p is in range pfirst:plast-1
+ // where pfirst >= 0 and plast <= mnz
+ int64_t p = kk + pfirst ;
+ // linear-time search for the k value of the pth entry
+ while ( Mp [ k + 1 ] <= p ) k++ ;
+ while ( Mp [ k ] > p ) k-- ;
+ ks [kk] = k ;
+ }
+ this_thread_block().sync();
+
+ //----------------------------------------------------------------------
+ // assign entries in C(i,j) to the buckets
+ //----------------------------------------------------------------------
+
+ // if B is hypersparse, bpleft ... TODO describe
+ // int64_t bpleft = 0 ;
+
+ for ( int64_t pM = pfirst + threadIdx.x;
+ pM < pfirst + my_chunk_size;
+ pM += blockDim.x )
+ {
+ int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM].
+ int64_t i = Mi [ pM ] ;
+ int64_t j = k ; // HACK, does not need to be initialized here
+
+ Ci[pM] = (k<<4) ;
+ }
+ }
+
+}
+
diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh
index 42e9a434a..7816b0e98 100644
--- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh
+++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh
@@ -1,12 +1,11 @@
-
//------------------------------------------------------------------------------
// AxB_dot3_phase3_dndn.cu
//------------------------------------------------------------------------------
// This CUDA kernel produces the semi-ring product of two
-// sparse matrices of types T_A and T_B and common index space size n, to a
-// output matrix of type T_C. The matrices are sparse, with different numbers
-// of non-zeros and different sparsity patterns.
+// dense matrices of types T_A and T_B and common index space size n, to an
+// output matrix of type T_C. The matrices are dense, so all entries are
+// present and the sparsity patterns are uniform.
// ie. we want to produce C = A'*B in the sense of the given semi-ring.
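+//
+// In scalar form, each C(i,j) is the dot product of the dense vectors
+// A(:,i) and B(:,j) under the semiring. A sketch of the sequential
+// computation that one warp performs cooperatively, assuming the
+// conventional (+,*) semiring on double (GB_ADD and GB_MULT generalize
+// this to any monoid and multiply operator):
+//
+//      double cij = 0 ;                        // additive identity
+//      for (int64_t k = 0 ; k < n ; k++)
+//      {
+//          cij += Ax [pA+k] * Bx [pB+k] ;      // cij += A(k,i) * B(k,j)
+//      }
+//
+// The kernel below strides k across the warp and then combines the
+// per-thread partial results with warp_ReduceSum.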
// This version uses a simple warp-based dense dot product algorithm, when the @@ -31,10 +30,23 @@ // GrB_Matrix B <- input matrix B // int sz <- size parameter (not used) +/* fixme: This kernel needs to be split into 4 methods: + + (A bitmap) * (B bitmap) + (A full ) * (B bitmap) + (A bitmap) * (B full) + (A full) * (B full) + + The buckets are not needed at all. A single pass can be done. + C and M would still be sparse or hypersparse. + + See also denseDotProduct.cu. +*/ + #pragma once #include #include -#include "matrix.h" +#include "GB_cuda_kernel.h" #include @@ -72,7 +84,7 @@ T block_ReduceSum(thread_block g, T val, T Ident) //tile.sync(); // Wait for all partial reductions - if (wid > 0 || gridDim.x == 1 ) return val; + if (wid > 0 ) return val; //read from shared memory only if that warp existed val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ; @@ -83,19 +95,23 @@ T block_ReduceSum(thread_block g, T val, T Ident) } -template< typename T_C, typename T_A, typename T_B> -__global__ void AxB_dot3_phase3_dndn +template< + typename T_C, typename T_A, typename T_B, + typename T_Z, typename T_X, typename T_Y, + uint64_t srcode> +__global__ void AxB_dot3_phase3_dndn ( - int64_t start, - int64_t end, - int64_t *Bucket, GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, - GrB_Matrix B, - int sz + GrB_Matrix B ) { + // TODO: Figure out how to use graphblas-specific INFINITY macro + #ifndef INFINITY + #define INFINITY std::numeric_limits::max() + #endif + const T_A *__restrict__ Ax = (T_A *)A->x ; const T_B *__restrict__ Bx = (T_B *)B->x ; T_C *__restrict__ Cx = (T_C *)C->x ; @@ -105,71 +121,134 @@ __global__ void AxB_dot3_phase3_dndn const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Ap = A->p ; const int64_t *__restrict__ Bp = B->p ; + #if GB_A_IS_BITMAP + const int8_t *__restrict__ Ab = A->b ; + #endif + #if GB_B_IS_BITMAP + const int8_t *__restrict__ Bb = B->b ; + #endif // zombie count int zc = 0; + // dot pair and index in bucket int64_t pair_id; + int64_t start = 0; + int64_t end = M->p[M->nvec]; + // total items to be inspected int64_t nnzA = 0; int64_t nnzB = 0; int s = blockDim.x; // Main loop over pairs - for (pair_id = start + blockIdx.x; //warp per pair - pair_id < end; - pair_id += gridDim.x ){ + for ( int64_t kk = start + blockIdx.x; //warp per pair + kk < end; + kk += gridDim.x ){ + pair_id = kk ; int64_t i = Mi[pair_id]; int64_t j = Ci[pair_id] >> 4; - int64_t pA = Ap[i]; - int64_t xend = Ap[i+1]; + int64_t pA = (A->vlen)*i; + int64_t xend = pA +(A->vlen); nnzA = xend - pA; - int64_t pB = Bp[j]; - int64_t yend = Bp[j+1]; + int64_t pB = (B->vlen)*j; + int64_t yend = pB +(B->vlen); nnzB = yend - pB; - if (threadIdx.x == 0 ){ - printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n", - threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB); - } - __syncthreads(); +// if (threadIdx.x == 0 ){ +// printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n", +// threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB); +// } +// __syncthreads(); // convert global data pointer to the local pointer of this block - T_A aki; // *xdata = &Ax[xstart]; - T_B bkj; // *ydata = &Bx[ystart]; - T_C cij; - - GB_GETA ( aki=(T_C)Ax[pA+threadIdx.x] ) ; // aki = A(0,i) - GB_GETB ( bkj=(T_C)Bx[pB+threadIdx.x] ) ; // bkj = B(0,j) - GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj - - for ( int tid = threadIdx.x + s; tid < nnzA; tid+= s) { - // cij += A(k,i) * B(k,j) - // GB_DOT_TERMINAL ( cij ) ; // break if cij == terminal - GB_GETA ( aki=(T_C)Ax[pA+tid] ) ; // aki = A(k,i) - GB_GETB ( bkj=(T_C)Bx[pB+tid] 
) ; // bkj = B(k,j)
- GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj
- }
-
+ GB_DECLAREA (aki) ;
+ GB_DECLAREB (bkj) ;
+
+ #if GB_A_IS_FULL && GB_B_IS_FULL
+
+ T_Z cij ; // = GB_IDENTITY ; not needed
+ GB_GETA ( aki, Ax, pA+threadIdx.x) ; // aki = A(0,i)
+ GB_GETB ( bkj, Bx, pB+threadIdx.x) ; // bkj = B(0,j)
+ GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj
+ for ( int64_t k = threadIdx.x + s; k < nnzA; k+= s) {
+ // cij += A(k,i) * B(k,j)
+ GB_GETA (aki, Ax, pA+k) ; // aki = A(k,i)
+ GB_GETB (bkj, Bx, pB+k) ; // bkj = B(k,j)
+ GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj
+ }
+
+ #elif GB_A_IS_BITMAP && GB_B_IS_BITMAP
+
+ T_Z cij = GB_IDENTITY ;
+ bool cij_exists = false ;
+ for ( int64_t k = threadIdx.x ; k < nnzA; k+= s) {
+ GB_GETA (aki, Ax, pA+k) ; // aki = A(k,i)
+ GB_GETB (bkj, Bx, pB+k) ; // bkj = B(k,j)
+ int8_t b = (Ab [pA+k] && Bb [pB+k]) ;
+ cij_exists |= b ;
+ if (b)
+ {
+ GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj
+ }
+ }
+
+ #elif GB_A_IS_FULL && GB_B_IS_BITMAP
+
+ T_Z cij = GB_IDENTITY ;
+ bool cij_exists = false ;
+ for ( int tid = threadIdx.x ; tid < nnzA; tid+= s) {
+ if (Bb [pB+tid])
+ {
+ GB_GETA (aki, Ax, pA+tid) ; // aki = A(k,i)
+ GB_GETB (bkj, Bx, pB+tid) ; // bkj = B(k,j)
+ GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj
+ cij_exists = true ;
+ }
+ }
+
+ #elif GB_A_IS_BITMAP && GB_B_IS_FULL
+
+ T_Z cij = GB_IDENTITY ;
+ bool cij_exists = false ;
+ for ( int tid = threadIdx.x ; tid < nnzA; tid+= s) {
+ if (Ab [pA+tid]) // note: A's bitmap is indexed with pA, not pB
+ {
+ GB_GETA (aki, Ax, pA+tid) ; // aki = A(k,i)
+ GB_GETB (bkj, Bx, pB+tid) ; // bkj = B(k,j)
+ GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj
+ cij_exists = true ;
+ }
+ }
+
+ #endif
//--------------------------------------------------------------------------
// reduce per-thread sums to a single scalar
//--------------------------------------------------------------------------
+
+ // FIXME: need to check if cij_exists for any thread, for the 3
+ // cases of bitmap*bitmap, full*bitmap, and bitmap*full, and if not,
+ // C(i,j) is a zombie.
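+
+ // A sketch of that missing check (hypothetical code; it mirrors what
+ // the merge-path kernel, phase3_mp, does): vote across the warp, and
+ // mark C(i,j) as a zombie when no thread found a live pair of entries:
+ //
+ //     #if !(GB_A_IS_FULL && GB_B_IS_FULL)
+ //     cij_exists = tile.any (cij_exists) ;
+ //     if (threadIdx.x == 0 && !cij_exists)
+ //     {
+ //         Ci [pair_id] = GB_FLIP (i) ;   // flipped index marks a zombie
+ //         zc++ ;                         // count it for C->nzombies
+ //     }
+ //     #endif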
+
+ #if !GB_C_ISO
thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() );
- cij = warp_ReduceSum ( tile, cij);
+ cij = warp_ReduceSum ( tile, cij);
+ #endif

// write result for this block to global mem
if (threadIdx.x == 0)
{
//printf("tid: %d final sum after reduce = %d\n", threadIdx.x, sum);
GB_PUTC( Cx[pair_id]=(T_C)cij ) ;
- GB_PUTC( Ci[pair_id]=i ) ;
+ Ci[pair_id]=i ;
}
//__syncthreads ( ) ;
+ // never have block zombies to add to C->nzombies
}
}
diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh
index 3230eecdd..c9f9413e8 100644
--- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh
+++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh
@@ -36,7 +36,7 @@
#include
#include
#include
-#include "matrix.h"
+#include "GB_cuda_kernel.h"

// Using tile size fixed at compile time, we don't need shared memory
#define tile_sz 32
@@ -49,7 +49,9 @@ T GB_reduce_sum(thread_block_tile<warp_sz> g, T val)
{
// Each iteration halves the number of active threads
// Each thread adds its partial sum[i] to sum[lane+i]
- for (int i = g.size() / 2; i > 0; i /= 2)
+ // A temporary T is necessary to handle arbitrary ops
+ #pragma unroll
+ for (int i = warp_sz >> 1; i > 0; i >>= 1)
{
T next = g.shfl_down( val, i);
val = GB_ADD( val, next ) ;
@@ -63,21 +65,23 @@ T reduce_plus(thread_block_tile<warp_sz> g, T val)
{
// Each iteration halves the number of active threads
// Each thread adds its partial sum[i] to sum[lane+i]
- for (int i = g.size() / 2; i > 0; i /= 2)
+ #pragma unroll
+ for (int i = warp_sz >> 1; i > 0; i >>= 1)
{
val += g.shfl_down( val, i) ;
}
return val; // note: only thread 0 will return full sum and flag value
-}
-
-#define intersects_per_thread 8
+}

-template< typename T_C, typename T_A, typename T_B>
+template<
+ typename T_C, typename T_A, typename T_B,
+ typename T_Z, typename T_X, typename T_Y,
+ uint64_t srcode>
__global__ void AxB_dot3_phase3_mp
(
int64_t start,
int64_t end,
- int64_t *Bucket,
+ int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -86,19 +90,23 @@ __global__ void AxB_dot3_phase3_mp
)
{

- C->jumbled = true;
- T_A *Ax = (T_A*)A->x;
- T_B *Bx = (T_B*)B->x;
- T_C *Cx = (T_C*)C->x;
- int64_t *Ci = C->i;
- int64_t *Mi = M->i;
- int64_t *Ai = A->i;
- int64_t *Bi = B->i;
- int64_t *Ap = A->p;
- int64_t *Bp = B->p;
+ // TODO: Figure out how to use graphblas-specific INFINITY macro
+ #ifndef INFINITY
+ #define INFINITY std::numeric_limits::max()
+ #endif
+
+ const T_A *__restrict__ Ax = (T_A *)A->x ;
+ const T_B *__restrict__ Bx = (T_B *)B->x ;
+ T_C *__restrict__ Cx = (T_C *)C->x ;
+ int64_t *__restrict__ Ci = C->i ;
+ const int64_t *__restrict__ Mi = M->i ;
+ const int64_t *__restrict__ Ai = A->i ;
+ const int64_t *__restrict__ Bi = B->i ;
+ const int64_t *__restrict__ Ap = A->p ;
+ const int64_t *__restrict__ Bp = B->p ;

// zombie count
- int zc = 0;
+ int64_t zc = 0;

int64_t pair_id;
@@ -109,148 +117,241 @@ __global__ void AxB_dot3_phase3_mp
int b = blockIdx.x ;

// total items to be inspected
- int64_t nnzA = 0;
- int64_t nnzB = 0;
- int64_t n_intersect = 0;
+ int64_t ainz = 0;
+ int64_t bjnz = 0;

thread_block_tile<tile_sz> tile = tiled_partition<tile_sz>( this_thread_block());
-
- int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread;
-
- // int has_zombies = 0 ;
+ int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ;

// Main loop over pairs
- for (pair_id = start+ blockIdx.x; //warp per pair
- pair_id < end;
- pair_id +=
gridDim.x ) + int64_t kk ; + for (kk = start+ blockIdx.x; //warp per pair + kk < end; + kk += gridDim.x ) { + pair_id = all_in_one ? kk : Bucket [kk] ; + //pair_id = kk ; int64_t i = Mi[pair_id]; int64_t j = Ci[pair_id] >> 4; - int64_t xstart = Ap[i]; - int64_t xend = Ap[i+1]; - nnzA = xend - xstart; - - int64_t ystart = Bp[j]; - int64_t yend = Bp[j+1]; - nnzB = yend - ystart; - -// if(threadIdx.x == 0 && j == 139 && i == 945) -// printf("blk%d tid=%d, nnzA=%d, nnzB=%d\n", blockIdx.x, tid_global, nnzA, nnzB); -// - n_intersect = GB_IMIN( xend -xstart, yend -ystart); - /* - if (threadIdx.x ==0 ) { - printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); - } - */ + // find A(:,i) + int64_t pA_start = Ap[i]; // pA_start + int64_t pA_end = Ap[i+1]; // pA_end + ainz = pA_end - pA_start; // ainz + + GB_DECLAREA (aki) ; + GB_DECLAREB (bkj) ; + T_Z cij = GB_IDENTITY ; + + // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. + // just check if cij > 0 + + int cij_exists = 0 ; + //printf(" thd%u has init value %f\n",tid, cij); + + #define shared_vector_size 128 + __shared__ int64_t Ai_s[shared_vector_size]; + int shared_steps_A = (ainz + shared_vector_size -1)/shared_vector_size; + + int64_t step_end = (shared_steps_A <= 1? ainz : shared_vector_size); + for ( int64_t i = tid; i< step_end; i+= blockDim.x) + { Ai_s[i] = Ai[ i + pA_start]; } + this_thread_block().sync(); + + + // find B(:,j) + int64_t pB_start = Bp[j]; // pB_start + int64_t pB_end = Bp[j+1]; // pB_end + bjnz = pB_end - pB_start; // bjnz + int shared_steps_B = (bjnz + shared_vector_size -1)/shared_vector_size; + + __shared__ int64_t Bj_s[shared_vector_size]; + + step_end = (shared_steps_B <= 1 ? bjnz : shared_vector_size); + for ( int64_t i =tid; i< step_end; i+= blockDim.x) + { Bj_s[i] = Bi[ i + pB_start]; } + this_thread_block().sync(); + + //if (threadIdx.x ==0 ) { + // printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); + // printf("block %d doing dot %lld ainz,bjnz= %lld,%lld, A_steps=%d, B_steps=%d\n", + // blockIdx.x, pair_id, ainz, bjnz, shared_steps_A, shared_steps_B); + //} + //this_thread_block().sync(); + //we want more than one intersection per thread - int64_t nxy = nnzA + nnzB; + while ( (shared_steps_A > 0) && (shared_steps_B > 0) ) + { + int64_t awork = pA_end - pA_start; + int64_t bwork = pB_end - pB_start; + if ( shared_steps_A > 1) awork = shared_vector_size; + if ( shared_steps_B > 1) bwork = shared_vector_size; + int64_t nxy = awork + bwork; - int work_per_thread = (nxy +parts -1)/parts; - int diag = GB_IMIN( work_per_thread*tid, nxy); - int diag_end = GB_IMIN( diag + work_per_thread, nxy); - //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); + - int x_min = GB_IMAX( (int)(diag - nnzB), 0); - int x_max = GB_IMIN( diag, nnzA); + int work_per_thread = (nxy + blockDim.x -1)/blockDim.x; // ceil Divide by 32 = blockDim.x + int diag = GB_IMIN( work_per_thread*tid, nxy); + int diag_end = GB_IMIN( diag + work_per_thread, nxy); + //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, blockDim.x, work_per_thread, diag, diag_end); + + //if (1) //(threadIdx.x == 0) + //{ + // printf ("pair %ld tid%d work_per_thread %d nxy %ld parts %d diag %d diag_end %d Astep=%d, Bstep=%d\n", + // pair_id, threadIdx.x, work_per_thread, nxy, blockDim.x, diag, diag_end,shared_steps_A,shared_steps_B) ; + //} + //this_thread_block().sync(); + + int x_min = GB_IMAX( (diag - bwork) , 0); //bwork takes place of bjnz + int 
x_max = GB_IMIN( diag, awork); //awork takes place of ainz - //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); while ( x_min < x_max) { //binary search for correct diag break - int pivot = (x_min +x_max)/2; - if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) { - x_min = pivot +1; - } - else { - x_max = pivot; - } + int pivot = (x_min +x_max) >> 1; + //printf("start search thd%u piv=%u xmin,xmax = %u,%u diag_end=%d\n", tid_global, pivot, x_min, x_max, diag_end); + int64_t Apiv = Ai_s[pivot] ; + int64_t Bpiv = Bj_s[diag -pivot -1] ; + + // if ( Apiv < Bpiv ) { + // x_min = pivot +1; + // } + // else { + // x_max = pivot; + // } + x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); + x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); + } + //printf("start search thd%u xcoord= %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); + int xcoord = x_min; int ycoord = diag -x_min -1; - if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { + int64_t Atest = Ai_s[xcoord] ; + int64_t Btest = Bj_s[ycoord] ; + if ( (diag > 0) + && (diag < nxy ) + && (ycoord >= 0 ) + && (Atest == Btest) + ) + { diag--; //adjust for intersection incrementing both pointers } // two start points are known now - int tx_start = xcoord +xstart; - int ty_start = diag -xcoord +ystart; + int tx_start = xcoord; // +pA_start; + int ty_start = diag -xcoord; // +pB_start; //if (x_start != y_start) // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); - x_min = GB_IMAX( (int)(diag_end - nnzB), 0); - x_max = GB_IMIN( diag_end, nnzA); + x_min = GB_IMAX( (diag_end - bwork), 0); //bwork replace bjnz + x_max = GB_IMIN( diag_end, awork); //awork replace ainz while ( x_min < x_max) { - int pivot = (x_min +x_max)/2; - //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); - if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) { - x_min = pivot +1; - } - else { - x_max = pivot; - } - //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); + int pivot = (x_min +x_max) >> 1; + int64_t Apiv = Ai_s[pivot] ; + int64_t Bpiv = Bj_s[diag_end -pivot -1] ; + + //if ( Apiv < Bpiv ) { + // x_min = pivot +1; + //} + //else { + // x_max = pivot; + //} + x_min = (pivot + 1)* (Apiv < Bpiv) + x_min * (1 - (Apiv < Bpiv)); + x_max = pivot * (1 - (Apiv < Bpiv)) + x_max * (Apiv < Bpiv); + } + //printf("end search thd%u x_coord = %u diag=%d, diag_end=%d\n", tid_global, x_min, diag, diag_end); xcoord = x_min; ycoord = diag_end -x_min -1; - if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { - diag--; //adjust for intersection incrementing both pointers - } - // two end points are known now - int tx_end = xcoord +xstart; - int ty_end = diag_end - xcoord + ystart; - T_A aki; - T_B bkj; - T_C cij = GB_IDENTITY ; - - // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. 
- // just check if cij > 0 - - int cij_exists = 0 ; - //printf(" thd%u has init value %f\n",tid, cij); + // two end points are known now + int tx_end = xcoord; // +pA_start; + int ty_end = diag_end - xcoord; // + pB_start; //merge-path dot product - int k = tx_start; - int l = ty_start; - -// if(threadIdx.x == 0 && j == 139) { + int64_t pA = tx_start; // pA + int64_t pB = ty_start; // pB + + //if (1) // threadIdx.x == 0) + //{ + // printf ("%d tx_start %d\n", threadIdx.x, tx_start) ; + // printf ("%d tx_end %d\n", threadIdx.x, tx_end ) ; + // printf ("%d ty_start %d\n", threadIdx.x, ty_start) ; + // printf ("%d ty_end %d\n", threadIdx.x, ty_end ) ; + //} + //this_thread_block().sync(); + +// if(threadIdx.x == 0 ) { // printf("blk%d, thd%d k=%d, l=%d, tx_start=%d, ty_start=%d, tx_end=%d, ty_end=%d\n", blockIdx.x, tid_global, k, l, tx_start, ty_start, tx_end, ty_end); // } +// this_thread_block().sync(); - while ( k < tx_end && l < ty_end && nnzA != 0 && nnzB != 0) + while ( pA < tx_end && pB < ty_end ) { - if (Ai [k] == Bi [l]) - { - GB_GETA ( aki=(T_C)Ax[k] ) ; - GB_GETB ( bkj=(T_C)Bx[l] ) ; - if (cij_exists) - { - T_C t = GB_MULT( (T_C)aki, (T_C)bkj ); - GB_ADD_F (cij, t ) ; -// if(j == 139 && i == 945) -// printf("blk%d thd%d ix at %lld %lld cij += %d * %d \n", blockIdx.x, tid_global, Ai[k], Bi[l], aki, bkj); - } - else + int64_t Aind = Ai_s[pA] ; + int64_t Bind = Bj_s[pB] ; + #if GB_IS_PLUS_PAIR_REAL_SEMIRING && GB_ZTYPE_IGNORE_OVERFLOW + cij += (Aind == Bind) ; + #else + if (Aind == Bind) { - cij_exists = 1 ; - cij = GB_MULT ( (T_C)aki, (T_C)bkj ) ; -// if(j == 139 && i == 945) -// printf("blk%d thd%d ix at %lld %lld cij = %d * %d, k=%d, l=%d i=%lld j=%lld \n", blockIdx.x, tid_global, Ai[k], Bi[l], Ax[k], Bx[l], k, l, i, j); + // cij += aki + bkj + GB_DOT_MERGE (pA + pA_start, pB + pB_start) ; + // TODO check terminal condition } - // TODO check terminal condition - k+= 1; - l+= 1; -// if(j == 139 && i == 945) -// printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); - } - else - { - k += ( Ai[k] < Bi[l] ) ; - l += ( Ai[k] > Bi[l] ) ; - } + #endif + pA += (Aind <= Bind) ; + pB += (Aind >= Bind) ; + } + GB_CIJ_EXIST_POSTCHECK ; + + this_thread_block().sync(); + + if ( (shared_steps_A >= 1) + && (shared_steps_B >= 1) + && ( Ai_s[awork-1] == Bj_s[bwork-1]) ) { + + pA_start += shared_vector_size; + shared_steps_A -= 1; + if (shared_steps_A == 0) break; + pB_start += shared_vector_size; + shared_steps_B -= 1; + if (shared_steps_B == 0) break; + + step_end = ( (pA_end - pA_start) < shared_vector_size ? (pA_end - pA_start) : shared_vector_size); + for ( int64_t i = tid; i< step_end; i+= blockDim.x) + { Ai_s[i] = Ai[ i + pA_start]; } + this_thread_block().sync(); + + step_end = ( (pB_end - pB_start) < shared_vector_size ? (pB_end - pB_start) : shared_vector_size); + for ( int64_t i = tid; i< step_end; i+= blockDim.x) + { Bj_s[i] = Bi[ i + pB_start]; } + this_thread_block().sync(); + + } + else if ( (shared_steps_A >= 1) && (Ai_s[awork-1] < Bj_s[bwork-1] )) { + pA_start += shared_vector_size; + shared_steps_A -= 1; + if (shared_steps_A == 0) break; + + step_end= ( (pA_end - pA_start) < shared_vector_size ? 
(pA_end - pA_start) : shared_vector_size); + for ( int64_t i = tid; i< step_end; i+= blockDim.x) + { Ai_s[i] = Ai[ i + pA_start]; } + this_thread_block().sync(); } + + else if ( (shared_steps_B >= 1) && (Bj_s[bwork-1] < Ai_s[awork-1]) ) { + pB_start += shared_vector_size; + shared_steps_B -= 1; + if (shared_steps_B == 0) break; + + step_end = ( (pB_end - pB_start) < shared_vector_size ? (pB_end - pB_start) : shared_vector_size); + for ( int64_t i = tid; i< step_end; i+= blockDim.x) + { Bj_s[i] = Bi[ i + pB_start]; } + this_thread_block().sync(); + } + } // end while shared_steps A > 0 && shared_steps_B >0 //tile.sync( ) ; //-------------------------------------------------------------------------- @@ -265,23 +366,22 @@ __global__ void AxB_dot3_phase3_mp */ // Do vote here for control. - cij_exists = tile.any( cij_exists); - //tile.sync(); + cij_exists = tile.any( cij_exists); + tile.sync(); + #if !GB_C_ISO if (cij_exists) { - cij = GB_reduce_sum( tile, cij ); - + cij = GB_reduce_sum( tile, cij ); } - // else has_zombies = 1; + #endif - - //__syncthreads(); - //tile.sync( ); // write result for this block to global mem if (tid == 0) { //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; +// if (mydump) printf ("Result for (%ld,%ld): %d\n", i, j, cij_exists); + if (cij_exists) { // @@ -289,15 +389,15 @@ __global__ void AxB_dot3_phase3_mp // printf("what's the deal here? %d, %ld\n", cij, i); // } - //printf(" cij = %d\n", cij); + // printf(" cij = %d\n", cij); GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; - GB_PUTC ( Ci[pair_id]=i ) ; + Ci[pair_id] = i ; } else { - printf(" dot %d is a zombie\n", pair_id); + // printf(" dot %d is a zombie\n", pair_id); zc++; - GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ; + Ci[pair_id]=GB_FLIP (i) ; } } //__syncthreads(); diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh index dbafd38b9..703015eae 100644 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh @@ -6,21 +6,38 @@ // Template on // Parameters: -// int64_t start <- beginning of bucket -// int64_t end <- end of bucket -// int64_t *Bucket <- index of each pair in this bucket // matrix *C <- C result matrix // matrix *M <- Mask matrix // matrix *A <- A matrix to multiply, sparse // matrix *B <- B matrix to multiply, dense in sparse format? -// int sz <- size hint for smaller vector //****************************************************************************** + +/* fixme: This kernel needs to be split into 4 methods. Perhaps a single + file with #ifdef's could be used to keep the code size down. + + (A sparse or hypersparse) * (B bitmap) + (A sparse or hypersparse) * (B full) + (A bitmap) * (B sparse or hypersparse) + (A full) * (B sparse or hypersparse) + + The buckets are not needed, unless the sparse matrix needs to be + split into "very sparse vectors" (one thread per dot) and "longer + sparse vectors" (one warp or threadblock cooperates on a single dot). + Then only 2 buckets are needed ... or the work could be done in a single + pass, and the test for these 2 cases could be done on the fly. + + The buckets are entirely different from the general case when both A and + B are sparse. + + C and M would still be sparse or hypersparse. 
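+
+   As a sketch of the on-the-fly test mentioned above (the cutoff
+   constant and names here are hypothetical, not part of this file):
+
+       int64_t ainz = Ap [i+1] - Ap [i] ;     // entries in A(:,i)
+       if (ainz < GB_VERY_SPARSE_CUTOFF)
+       {
+           // one thread computes the entire dot product C(i,j)
+       }
+       else
+       {
+           // one warp (or threadblock) cooperates on the single dot
+       }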
+*/ + #pragma once #include #include #include -#include "matrix.h" +#include "GB_cuda_kernel.h" #include @@ -45,19 +62,23 @@ __device__ T reduce_sum(thread_block_tile g, T val) } -template< typename T_C, typename T_A, typename T_B> +template< + typename T_C, typename T_A, typename T_B, + typename T_Z, typename T_X, typename T_Y, + uint64_t srcode> __global__ void AxB_dot3_phase3_spdn ( - int64_t start, - int64_t end, - int64_t *Bucket, GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, - GrB_Matrix B, - int sz + GrB_Matrix B ) { + // TODO: Figure out how to use graphblas-specific INFINITY macro + #ifndef INFINITY + #define INFINITY std::numeric_limits::max() + #endif + const T_A *__restrict__ Ax = (T_A *)A->x ; const T_B *__restrict__ Bx = (T_B *)B->x ; T_C *__restrict__ Cx = (T_C *)C->x ; @@ -71,35 +92,26 @@ __global__ void AxB_dot3_phase3_spdn // typedef cub::BlockReduce BlockReduce; // __shared__ typename BlockReduce::TempStorage temp_storage; - // sz = expected non-zeros per dot - int m = 256/sz; - int nvec = end - start; - int dpt = nvec/32; - m = dpt < m ? dpt : m; // if( threadIdx.x ==0) // printf("thd:%d %d dots/thrd, nvec = %d blockDim=%d\n",threadIdx.x, sz, nvec, blockDim.x); // __syncthreads(); - int dots = (nvec +m -1)/m; - -// printf("dots=%d, m=%d, dpt=%d\n", dots, m, dpt); - int zc = 0; - - for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; - tid < dots; - tid += blockDim.x * gridDim.x) { - int pair_id, im; + + int64_t pair_id; + + int64_t start = 0; + int64_t end = M->p[M->nvec]; // if (threadIdx.x ==0) // printf("thd%u pi=%lld\n",tid, start+threadIdx.x); // __syncthreads(); - for (pair_id = start+tid, im = 0; - im < m && pair_id < end; - ++im, pair_id += dots ){ + for (int64_t kk = start +threadIdx.x +blockIdx.x*blockDim.x; + kk < end ; + kk += gridDim.x*blockDim.x ){ + pair_id = kk ; int64_t i = Mi[pair_id]; // cols from mask - // TODO: column of Ci / 16? 
- int64_t j = Ci[pair_id] >> 4; // row number of C + int64_t j = Ci[pair_id] >> 4; // row number of C previously encoded in phase1 //printf("tid=%d, i=%lu, j=%lu\n", threadIdx.x, i, j); @@ -107,16 +119,24 @@ __global__ void AxB_dot3_phase3_spdn // printf("thd%u i,j=%lld,%lld\n",tid, i,j); // __syncthreads(); - // Prime row offsets for both A and B + // FIXME: this should use flags on A and B instead + // Prep row offsets for both A and B int64_t pA = Ap[i]; // row of C int64_t pA_end = Ap[i+1]; int64_t nnzA = pA_end - pA; + +#if GB_B_IS_SPARSE int64_t pB = Bp[j]; // col of C int64_t pB_end = Bp[j+1]; +#elif GB_B_IS_FULL + int64_t pB = (B->vlen)*j; + int64_t pB_end = pB +(B->vlen); +#endif + int64_t nnzB = pB_end - pB; - T_A aki; - T_B bkj; - T_C cij; + GB_DECLAREA (aki) ; + GB_DECLAREB (bkj) ; + T_Z cij = GB_IDENTITY ; int zombie_count = 0; @@ -132,14 +152,13 @@ __global__ void AxB_dot3_phase3_spdn */ int64_t k = Bi [pB] ; // first row index of B(:,j) // cij = A(k,i) * B(k,j) - GB_GETA ( aki=(T_C)Ax[pA+k] ) ; // aki = A(k,i) - GB_GETB ( bkj=(T_C)Bx[pB] ) ; // bkj = B(k,j) - + GB_GETA ( aki, Ax, pA+k ) ; // aki = A(k,i) + GB_GETB ( bkj, Bx, pB ) ; // bkj = B(k,j) // TODO: Check tha GB_C_MULT applies the identity automatically since cij has not been initialized GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj - //printf("A_dense: tid=%d, pair_id=%d, i=%lu, j=%lu, nnzA=%lu, nnzB=%lu, k[B]=%lu, aki=%d, bkj=%d, cij=%d\n", threadIdx.x, pair_id, i, j, nnzA, nnzB, k, aki, bkj, cij); + printf("A_dense: tid=%d, pair_id=%d, i=%lu, j=%lu, nnzA=%lu, nnzB=%lu, k[B]=%lu, aki=%d, bkj=%d, cij=%d\n", threadIdx.x, pair_id, i, j, nnzA, nnzB, k, aki, bkj, cij); /** * @@ -149,8 +168,8 @@ __global__ void AxB_dot3_phase3_spdn //GB_DOT_TERMINAL (cij) ; // break if cij == terminal int64_t k = Bi [p] ; // next row index of B(:,j) // cij += A(k,i) * B(k,j) - GB_GETA ( aki=(T_C)Ax[pA+k] ) ; // aki = A(k,i) - GB_GETB ( bkj=(T_C)Bx[p] ) ; // bkj = B(k,j) + GB_GETA ( aki, Ax, pA+k ) ; // aki = A(k,i) + GB_GETB ( bkj, Bx, p ) ; // bkj = B(k,j) GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj //printf("in_loop: tid=%d, pair_id=%d, i=%lu, j=%lu, nnzA=%lu, nnzB=%lu, k[B]=%lu, aki=%d, bkj=%d, cij=%d\n", threadIdx.x, pair_id, i, j, nnzA, nnzB, k, aki, bkj, cij); } @@ -160,10 +179,10 @@ __global__ void AxB_dot3_phase3_spdn { int64_t k = Ai [pA] ; // first col index of A(i, :) // cij = A(i,k) * B(j,k) - GB_GETA ( aki=(T_C)Ax[ pA ] ) ; // aki = A(i,k) + GB_GETA ( aki, Ax, pA ) ; // aki = A(i,k) // Jump straight to position in B vector (since we know it's dense) - GB_GETB ( bkj=(T_C)Bx[ pB+k ] ) ; // bkj = B(k,j) + GB_GETB ( bkj, Bx, pB+k ) ; // bkj = B(k,j) GB_C_MULT ( cij, aki, bkj) ; // cij = aki * bkj //printf("B_dense: tid=%d, pair_id=%d, i=%lu, j=%lu, nnzA=%lu, nnzB=%lu, k[B]=%lu, aki=%d, bkj=%d, cij=%d\n", threadIdx.x, pair_id, i, j, nnzA, nnzB, k, aki, bkj, cij); @@ -173,9 +192,9 @@ __global__ void AxB_dot3_phase3_spdn //GB_DOT_TERMINAL (cij) ; // break if cij == terminal int64_t k = Ai [p] ; // next row index of A(:,i) // cij += A(k,i) * B(k,j) - GB_GETA ( aki=(T_C)Ax[ p ] ) ; // aki = A(i,k) - GB_GETB ( bkj=(T_C)Bx[ pB+k] ) ; // bkj = B(j,k) - GB_MULTADD ( cij, aki, bkj) ; // cij += aik * bjk + GB_GETA ( aki, Ax, p ) ; // aki = A(i,k) + GB_GETB ( bkj, Bx, pB+k) ; // bkj = B(j,k) + GB_MULTADD ( cij, aki, bkj) ; // cij += aik * bjk //printf("in_loop: tid=%d, pair_id=%d, i=%lu, j=%lu, nnzA=%lu, nnzB=%lu, k[B]=%lu, aki=%d, bkj=%d, cij=%d\n", threadIdx.x, pair_id, i, j, nnzA, nnzB, k, aki, bkj, cij); } } @@ -196,8 +215,8 @@ 
__global__ void AxB_dot3_phase3_spdn // // cij = A(k,i) * B(k,j) // //// printf("tid=%d, A is dense, k=%ld, i=%ld\n", threadIdx.x, k, i); -// GB_GETA ( aki=(T_C)Ax[pA + i] ) ; // aki = A(k,i) -// GB_GETB ( bkj=(T_C)Bx[pB] ) ; // bkj = B(k,j) +// GB_GETA ( aki, Ax,pA + i ) ; // aki = A(k,i) +// GB_GETB ( bkj, Bx,pB ) ; // bkj = B(k,j) // cij = GB_MULT(aki, bkj ) ; // cij = aki * bkj // // } @@ -218,8 +237,8 @@ __global__ void AxB_dot3_phase3_spdn // // cij = A(k,i) * B(k,j) // //// printf("tid=%d, A is dense, k=%ld, i=%ld\n", threadIdx.x, k, i); -// GB_GETA ( aki=(T_C)Ax[pA + i] ) ; // aki = A(k,i) -// GB_GETB ( bkj=(T_C)Bx[pB] ) ; // bkj = B(k,j) +// GB_GETA ( aki, Ax,pA + i ) ; // aki = A(k,i) +// GB_GETB ( bkj, Bx,pB ) ; // bkj = B(k,j) // cij = GB_MULT(aki, bkj ) ; // cij = aki * bkj // // for (int64_t p = pB+1 ; p < pB_end ; p++) @@ -227,8 +246,8 @@ __global__ void AxB_dot3_phase3_spdn // //GB_DOT_TERMINAL (cij) ; // break if cij == terminal // int64_t k = Bi [p] ; // next row index of B(:,j) // // cij += A(k,i) * B(k,j) -// GB_GETA ( aki=(T_C)Ax[A->vlen * i + k] ) ; // aki = A(k,i) -// GB_GETB ( bkj=(T_C)Bx[p] ) ; // bkj = B(k,j) +// GB_GETA ( aki, Ax,A->vlen * i + k ) ; // aki = A(k,i) +// GB_GETB ( bkj, Bx,p ) ; // bkj = B(k,j) // cij = GB_ADD ( cij, GB_MULT(aki, bkj ) ) ; // cij += aki * bkj // } // } @@ -261,8 +280,8 @@ __global__ void AxB_dot3_phase3_spdn // int64_t k = Ai [pA] ; // first row index of A(:,i) //// printf("tid=%d, B is dense, k=%ld, j=%ld\n", threadIdx.x, k, j); // // cij = A(k,i) * B(k,j) -// GB_GETA ( aki= (T_C)Ax[ pA ] ) ; // aki = A(k,i) -// GB_GETB ( bkj=(T_C)Bx[ B->vlen*k+j ] ) ; // bkj = B(k,j) +// GB_GETA ( aki, Ax, pA ) ; // aki = A(k,i) +// GB_GETB ( bkj, Bx, B->vlen*k+j ) ; // bkj = B(k,j) // // cij = GB_MULT(aki, bkj) ; // cij = aki * bkj //// printf("aki=%d, bkj=%d, cij=%d\n", aki, bkj, cij); @@ -272,8 +291,8 @@ __global__ void AxB_dot3_phase3_spdn // //GB_DOT_TERMINAL (cij) ; // break if cij == terminal // int64_t k = Ai [p] ; // next row index of A(:,i) // // cij += A(k,i) * B(k,j) -// GB_GETA ( aki=(T_C)Ax[ p ] ) ; // aki = A(k,i) -// GB_GETB ( bkj=(T_C)Bx[ B->vlen*k+j] ) ; // bkj = B(k,j) +// GB_GETA ( aki,Ax, p ) ; // aki = A(k,i) +// GB_GETB ( bkj,Bx, B->vlen*k+j ) ; // bkj = B(k,j) // cij = GB_ADD ( cij, GB_MULT(aki, bkj) ); // cij += aki * bkj //// printf("aki=%d, bkj=%d, cij=%d\n", aki, bkj, cij); // } @@ -284,8 +303,8 @@ __global__ void AxB_dot3_phase3_spdn // } // } - GB_PUTC( Ci[pair_id]=i ) ; - GB_PUTC( Cx[pair_id]=cij ) ; + Ci[pair_id]=i ; + GB_PUTC( Cx[pair_id]=(T_C) cij ) ; // int zc = BlockReduce(temp_storage).Sum(zombie_count); thread_block_tile tile = tiled_partition( this_thread_block()); @@ -294,7 +313,4 @@ __global__ void AxB_dot3_phase3_spdn if(threadIdx.x == 0 && zc > 0) atomicAdd(&(C->nzombies), zc); } - - } - } diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cuh deleted file mode 100644 index 33d651f39..000000000 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cuh +++ /dev/null @@ -1,234 +0,0 @@ -//------------------------------------------------------------------------------ -// spGEMM_very_sparse_sparse.cu -//------------------------------------------------------------------------------ - -// The spGEM_vssp CUDA kernel produces the semi-ring product of two -// sparse matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. 
The matrices are sparse, with different numbers -// of non-zeros and different sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. - -// This version uses a binary-search algorithm, when the sizes nnzA and nnzB -// are far apart in size, neither is very spare nor dense, for any size of N. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(nzA, nzB), 32) - -// Thus, each t in threadblock b owns a part of the set of pairs in the -// sparse-sparse bucket of work. The job for each pair of vectors is to find -// the intersection of the index sets Ai and Bi, perform the semi-ring dot -// product on those items in the intersection, and finally -// on exit write it to Cx [pair]. - -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// GrB_Matrix C <- result matrix -// GrB_Matrix M <- mask matrix -// GrB_Matrix A <- input matrix A -// GrB_Matrix B <- input matrix B -#pragma once - -#include -#include -#include -#include "matrix.h" - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -template< typename T, int warpSize > -__device__ T reduce_sum(thread_block_tile g, T val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - val += g.shfl_down(val,i) ; - } - return val; // note: only thread 0 will return full sum -} - -#define intersects_per_thread 8 - -template< typename T_C, typename T_A, typename T_B> -__global__ void AxB_dot3_phase3_vssp -( - int64_t start, - int64_t end, - int64_t *Bucket, - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz -) -{ - // Typed pointers to access data in A,B,C - T_A *Ax = (T_A*)A->x; - T_B *Bx = (T_B*)B->x; - T_C *Cx = (T_C*)C->x; - int64_t *Ci = C->i; - int64_t *Mi = M->i; - int64_t *Ai = A->i; - int64_t *Bi = B->i; - int64_t *Ap = A->p; - int64_t *Bp = B->p; - - // sz = expected non-zeros per dot - int m = 256/sz; - int nvecs = end - start; - int dpt = nvecs/(gridDim.x*32); - - int dots = (nvecs +dpt -1)/dpt; - - // zombie count - int zc = 0; - int64_t pair_id, im; - - // set thread ID - unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; - unsigned int tid = threadIdx.x; - - unsigned long int b = blockIdx.x ; - - // Main loop over pairs - for (pair_id = start+ tid_global, im = 0; - pair_id < end && im < m; - pair_id += gridDim.x*blockDim.x, ++im){ - - int64_t i = Mi[pair_id]; - int64_t j = Ci[pair_id] >> 4; - - if( j < 0) //Pre-zombie - { - zc++; - continue; - } - - int64_t pA = Ap[i]; - int64_t pA_end = Ap[i+1]; - int64_t nnzA = pA_end - pA; - - int64_t pB = B->p[j]; - int64_t pB_end = B->p[j+1]; - int64_t nnzB = pB_end - pB; - - //Search for each nonzero in the smaller vector to find intersection - bool cij_exists = false; - - T_A aki; - T_B bkj; - T_C cij; - - if (nnzA <= nnzB) { - //---------------------------------------------------------------------- - // A(:,i) is very sparse compared to B(:,j) - //---------------------------------------------------------------------- - - while (pA < pA_end && pB < pB_end) - { - int64_t ia = Ai [pA] ; - int64_t ib = Bi [pB] ; - if (ia < ib) - { - // 
A(ia,i) appears before B(ib,j) - pA++ ; - } - else if (ib < ia) - { - // B(ib,j) appears before A(ia,i) - // discard all entries B(ib:ia-1,j) - int64_t pleft = pB + 1 ; - int64_t pright = pB_end - 1 ; - GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ; - //ASSERT (pleft > pB) ; - pB = pleft ; - } - else // ia == ib == k - { - // A(k,i) and B(k,j) are the next entries to merge - #if defined ( GB_PHASE_1_OF_2 ) - cij_exists = true ; - break ; - #else - GB_DOT_MERGE ; - //GB_DOT_TERMINAL (cij) ; // break if cij == terminal - pA++ ; - pB++ ; - #endif - } - } - } - else { - //---------------------------------------------------------------------- - // B(:,j) is very sparse compared to A(:,i) - //---------------------------------------------------------------------- - - while (pA < pA_end && pB < pB_end) - { - int64_t ia = Ai [pA] ; - int64_t ib = Bi [pB] ; - if (ia < ib) - { - // A(ia,i) appears before B(ib,j) - // discard all entries A(ia:ib-1,i) - int64_t pleft = pA + 1 ; - int64_t pright = pA_end - 1 ; - GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ; - //ASSERT (pleft > pA) ; - pA = pleft ; - } - else if (ib < ia) - { - // B(ib,j) appears before A(ia,i) - pB++ ; - } - else // ia == ib == k - { - // A(k,i) and B(k,j) are the next entries to merge - #if defined ( GB_PHASE_1_OF_2 ) - cij_exists = true ; - break ; - #else - GB_DOT_MERGE ; - //GB_DOT_TERMINAL (cij) ; // break if cij == terminal - pA++ ; - pB++ ; - #endif - } - } - - } - if ( cij_exists){ - GB_PUTC ( Ci[pair_id]=i ) ; - GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; - } - else { - zc++; - //printf(" %lld, %lld is zombie %d!\n",i,j,zc); - GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; - } - - - } - - //-------------------------------------------------------------------------- - // reduce sum per-thread values to a single scalar - //-------------------------------------------------------------------------- - thread_block_tile tile = tiled_partition( this_thread_block()); - zc = reduce_sum(tile, zc); - - if( threadIdx.x ==0) { - //printf("warp %d zombie count = %d\n", blockIdx.x, zc); - atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); - //printf(" Czombie = %lld\n",C->nzombies); - } - -} - diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cuh index 698136b6c..8cba2757b 100644 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cuh +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cuh @@ -24,9 +24,10 @@ #define GB_CUDA_KERNEL #include #include +#include #include #include -#include "matrix.h" +#include "GB_cuda_kernel.h" using namespace cooperative_groups; @@ -36,32 +37,40 @@ T warp_ReduceSumPlus( thread_block_tile g, T val) { // Each iteration halves the number of active threads // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) { - //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + /* + #pragma unroll + for (int i = tile_sz >> 1; i > 0; i >>= 1) { val += g.shfl_down( val, i); } + */ + val += g.shfl_down( val, 16); + val += g.shfl_down( val, 8); + val += g.shfl_down( val, 4); + val += g.shfl_down( val, 2); + val += g.shfl_down( val, 1); return val; // note: only thread 0 will return full sum } - +/* template< typename T, int tile_sz> __inline__ __device__ T warp_Reduce( thread_block_tile g, T val) { // Each iteration halves the number of active threads // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) { + #pragma 
unroll + for (int i = tile_sz >> 1; i > 0; i >>= 1) { T next = g.shfl_down( val, i) ; - val = GB_ADD( sum, next ) ; + val = GB_ADD( val, next ) ; } return val; // note: only thread 0 will return full sum } +*/ template __inline__ __device__ T block_ReduceSum(thread_block g, T val) { static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums - int lane = threadIdx.x & 31 ; // % warpSize; int wid = threadIdx.x >> 5 ; // / warpSize; @@ -72,37 +81,40 @@ T block_ReduceSum(thread_block g, T val) // Wait for all partial reductions if (lane==0) shared[wid]=val; // Write reduced value to shared memory - __syncthreads(); // Wait for all partial reductions - for(int i = threadIdx.x; i < warpSize; i+= blockDim.x) { - printf("blockIdx.x=%d, wid=%d, val=%lld\n", blockIdx.x, i, shared[i]); - } + g.sync(); // Wait for all partial reductions -// if (wid > 0 || gridDim.x == 1 ) return val; + //if (wid > 0 ) return val; //read from shared memory only if that warp existed val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0; - printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); if (wid==0) val = warp_ReduceSumPlus( tile, val); //Final reduce within first warp return val; } -template< typename T_C, typename T_A, typename T_B> +template< + typename T_C, typename T_A, typename T_B, + typename T_Z, typename T_X, typename T_Y, uint64_t srcode> __global__ void AxB_dot3_phase3_vsvs ( int64_t start, int64_t end, - int64_t *Bucket, + int64_t *Bucket, // do the work in Bucket [start:end-1] GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B, - int sz + int sz // unused ) { -// printf("start=%lu, end=%lu\n", start, end); - int dots = end - start; + + // TODO: Figure out how to use graphblas-specific INFINITY macro + #ifndef INFINITY + #define INFINITY std::numeric_limits::max() + #endif + + int64_t dots = end - start; // sz = expected non-zeros per dot // /* // int m = (gridDim.x*blockDim.x)*256/sz; @@ -113,55 +125,48 @@ __global__ void AxB_dot3_phase3_vsvs // */ const T_A *__restrict__ Ax = (T_A *)A->x ; const T_B *__restrict__ Bx = (T_B *)B->x ; - T_C *__restrict__ Cx = (T_C *)C->x ; - int64_t *__restrict__ Ci = C->i ; + T_C *__restrict__ Cx = (T_C *)C->x ; + int64_t *__restrict__ Ci = C->i ; const int64_t *__restrict__ Mi = M->i ; const int64_t *__restrict__ Ai = A->i ; const int64_t *__restrict__ Bi = B->i ; const int64_t *__restrict__ Ap = A->p ; const int64_t *__restrict__ Bp = B->p ; - int pfirst, plast; - - //#define GB_PARTITION(k1,k2,n,tid,nthreads) \ + //int64_t pfirst, plast; - GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; -// if( threadIdx.x ==0 ) -// { -// if( threadIdx.x ==0 ) -// { -// printf("block%d %d dots/thrd, start,end = %ld,%ld pf,pl=%d,%d blockDim=%d\n", -// blockIdx.x, (dots + blockDim.x*gridDim.x -1)/(blockDim.x*gridDim.x), -// start, end, pfirst, plast, blockDim.x); -// } -// __syncthreads(); + //GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; + int64_t my_nzombies = 0 ; - int zc = 0 ; - - int64_t pair_id; + int all_in_one = ( (end - start) == (M->p)[(M->nvec)] ) ; - //for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; - // tid < dots; - // tid += blockDim.x * gridDim.x) - for ( int tid = pfirst+ threadIdx.x ; - tid < plast; - tid += blockDim.x ) - { - pair_id = Bucket[ start + tid ]; + //for ( int64_t kk = pfirst+ threadIdx.x ; + // kk < plast; + // kk += blockDim.x ) + for ( int64_t kk = start+ threadIdx.x +blockDim.x*blockIdx.x ; + kk < end; + kk += blockDim.x*gridDim.x ) + { + int64_t pair_id = all_in_one ? 
kk : Bucket[ kk ]; + // HACK: assumes C and M are sparse, not hypersparse int64_t i = Mi [pair_id] ; - int64_t j = Ci [pair_id]>>4 ; - if (j < 0) continue; //don't operate on zombies - printf("start=%d, tid=%d, pair_id=%lu, (i,j)=%lu,%lu\n", pfirst, tid, pair_id,i,j); + int64_t j = Ci [pair_id]>>4 ; // this is "k", not "j" + // C, M hypersparse, we do: j = Mh [k]. + // Note Ch == Mh, even with zombies in C. + + // HACK: assumes A is sparse, not hypersparse int64_t pA = Ap[i] ; int64_t pA_end = Ap[i+1] ; + + // HACK: assumes B is sparse, not hypersparse int64_t pB = Bp[j] ; int64_t pB_end = Bp[j+1] ; - T_A aki; - T_B bkj; - T_C cij ; + GB_DECLAREA (aki) ; + GB_DECLAREB (bkj) ; + T_Z cij = GB_IDENTITY ; bool cij_exists = false; @@ -169,48 +174,36 @@ __global__ void AxB_dot3_phase3_vsvs { int64_t ia = Ai [pA] ; int64_t ib = Bi [pB] ; - if( ia == ib) - { - // A(k,i) and B(k,j) are the next entries to merge - #if defined ( GB_PHASE_1_OF_2 ) - cij_exists = true ; - break ; - #else - GB_DOT_MERGE ; - //GB_DOT_TERMINAL (cij) ; // break if cij == terminal - pA++ ; - pB++ ; - #endif - } - else - { - // A(ia,i) appears before B(ib,j) - pA += ( ia < ib); - // B(ib,j) appears before A(ia,i) - pB += ( ib < ia); - } + #if GB_IS_PLUS_PAIR_REAL_SEMIRING && GB_ZTYPE_IGNORE_OVERFLOW + cij += (ia == ib) ; + #else + if (ia == ib) + { + // A(k,i) and B(k,j) are the next entries to merge + GB_DOT_MERGE (pA, pB) ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + } + #endif + pA += ( ia <= ib); // incr pA if A(ia,i) at or before B(ib,j) + pB += ( ib <= ia); // incr pB if B(ib,j) at or before A(ia,i) } + GB_CIJ_EXIST_POSTCHECK ; if (cij_exists){ - GB_PUTC ( Ci[pair_id] = i ) ; + Ci[pair_id] = i ; GB_PUTC ( Cx[pair_id] = (T_C)cij ) ; } else{ - printf(" %lld, %lld is zombie %d!\n",i,j,zc); - zc++; - GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; + my_nzombies++; + Ci[pair_id] = GB_FLIP( i ) ; } } - - __syncthreads(); - - printf("thd%d zombie count = %d\n",threadIdx.x,zc); - zc = block_ReduceSum( this_thread_block(), zc); - __syncthreads(); - if( threadIdx.x == 0 && zc > 0) { - printf("block%d zombie count = %d\n", blockIdx.x, zc); - atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); -// C->nzombies += (unsigned long long int)zc; - printf("blk:%d Czombie = %lld\n", blockIdx.x,C->nzombies); - } + this_thread_block().sync(); + + my_nzombies = block_ReduceSum( this_thread_block(), my_nzombies); + this_thread_block().sync(); + if( threadIdx.x == 0 && my_nzombies > 0) { + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)my_nzombies); + } } + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cuh deleted file mode 100644 index 2386148cc..000000000 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cuh +++ /dev/null @@ -1,388 +0,0 @@ -//------------------------------------------------------------------------------ -// AxB_dot3_phase3_warpix.cu -//------------------------------------------------------------------------------ - -// This CUDA kernel produces the semi-ring product of two -// sparse matrices of types T_A and T_B and common index space size n, to a -// output matrix of type T_C. The matrices are sparse, with different numbers -// of non-zeros and different sparsity patterns. -// ie. we want to produce C = A'*B in the sense of the given semi-ring. 
- -// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are -// relatively close in size, neither is very spare nor dense, for any size of N. -// Handles arbitrary sparsity patterns with guaranteed load balance. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(g_xnz, g_ynz), 32) - -// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job -// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot -// product on those items in the intersection, and finally reduce this data to a scalar, -// on exit write it to g_odata [b]. - -// int64_t start <- start of vector pairs for this kernel -// int64_t end <- end of vector pairs for this kernel -// int64_t *Bucket <- array of pair indices for all kernels -// matrix *C <- result matrix -// matrix *M <- mask matrix -// matrix *A <- input matrix A -// matrix *B <- input matrix B - -#pragma once -#define GB_CUDA_KERNEL -#include -#include -#include "matrix.h" -#include - -// Using tile size fixed at compile time, we don't need shared memory -#define tile_sz 32 - -using namespace cooperative_groups; - -template< typename T, int warp_sz> -__device__ __inline__ -T GB_reduce_sum(thread_block_tile g, T val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - T next = g.shfl_down( val, i); - val = GB_ADD( val, next ) ; - } - return val; -} - -template< typename T, int warp_sz> -__device__ __inline__ -T reduce_plus(thread_block_tile g, T val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - val += g.shfl_down( val, i) ; - } - return val; // note: only thread 0 will return full sum and flag value -} - -#define intersects_per_thread 8 - -template< typename T_C, typename T_A, typename T_B> -__global__ void AxB_dot3_phase3_warpix -( - int64_t start, - int64_t end, - int64_t *__restrict__ Bucket, - GrB_Matrix C, - GrB_Matrix M, - GrB_Matrix A, - GrB_Matrix B, - int sz -) -{ - T_A *__restrict__ Ax = (T_A*)A->x; - T_B *__restrict__ Bx = (T_B*)B->x; - T_C *__restrict__ Cx = (T_C*)C->x; - int64_t *__restrict__ Ci = C->i; - int64_t *__restrict__ Mi = M->i; - int64_t *__restrict__ Mp = M->p; - int64_t *__restrict__ Ai = A->i; - int64_t *__restrict__ Bi = B->i; - int64_t *__restrict__ Ap = A->p; - int64_t *__restrict__ Bp = B->p; - - int64_t mnvec = M->nvec; - - // zombie count - int zc; - - int64_t pair_id; - - // set thread ID - int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; - int tid = threadIdx.x; - int b = blockIdx.x ; - - // total items to be inspected - int64_t nnzA = 0; - int64_t nnzB = 0; - - thread_block_tile tile = tiled_partition( this_thread_block()); - - //int parts = gridDim.x; //Each warp is a part - - //Find our part of the work bucket - int64_t pfirst, plast, kfirst, klast ; - GB_PARTITION (pfirst, plast, end-start, b, gridDim.x ) ; - /* - if( tid ==0 ) { - printf("block%d is alive, pf,pl=%ld,%ld \n", b, pfirst, plast); - } - __syncthreads(); - */ - - - __shared__ int64_t As[256]; - __shared__ int64_t Bs[256]; - __shared__ T_A Axs[256]; - __shared__ T_B Bxs[256]; - - /* - int Bpl[9]; // local offsets into shared for multiple vectors of B - int shr_vec[8] ; 
//columns of B we see in this task - - pair_id = Bucket[pfirst]; - int64_t i = Mi[pair_id] ; - int vecs = 1 ; - int last_vec = i; - shr_vec[0] = i; - for (int id =1; id< plast-pfirst; id++) - { - pair_id = Bucket[pfirst+id]; - i = Mi[pair_id]; - if (i == last_vec) continue; - vecs++; - shr_vec[vecs] = i; - last_vec = i; - } - int all_loaded = 0; - - Bpl[0] = 0; - for ( int k = 0; k < vecs; k++) - { - int64_t pA = Ap[ shr_vec[k] ]; - int64_t pA_end = Ap[ shr_vec[k] +1]; - nnzA = pA_end - pA; - Bpl[k+1] = Bpl[k] + nnzA; - for (int i = tid ; i < nnzA; i+= blockDim.x) - { - As[ Bpl[k] +i ] = Ai[ pA + i ] ; - } - __syncthreads(); - } - - //pre-load columns of B, which will be reused, to shared memory - //Due to loading a contigious block with stride 1 this is fast - - all_loaded = (Bpl[vecs] < 256 ); - if( tid == 0 ) { - printf("block%d loaded %d vals from B, vecs=%d, all_loaded=%d\n", - b, Bpl[vecs], vecs, all_loaded ); - } - __syncthreads(); - - - // reset counter - */ - // Main loop over pairs - for (int id = start + pfirst; // loop on pairs - id < start+ plast; - id ++ ) - { - int64_t pair_id = Bucket[id]; - - int64_t i = Mi[pair_id]; - int64_t j = Ci[pair_id] >> 4; - - int64_t pA = Ap[i]; - int64_t pA_end = Ap[i+1]; - nnzA = pA_end - pA; - - int64_t pB = Bp[j]; - int64_t pB_end = Bp[j+1]; - nnzB = pB_end - pB; - - zc = 0 ; - int j_last = -1 ; - - - // No search, this warp does all the work - - int tx_start = pA; - int tx_end = pA_end; - int ty_start = pB; - int ty_end = pB_end; - - for ( int i = tid; i < nnzA ; i+= blockDim.x) - { - As [i] = Ai[ pA + i]; - Axs[i] = Ax[ pA + i]; - } - __syncthreads(); - - if ( j != j_last) { - for ( int i = tid; i < nnzB ; i+= blockDim.x) - { - Bs [i] = Bi[ pB + i]; - Bxs[i] = Bx[ pB + i]; - } - __syncthreads(); - j_last = j; - } - - - /* - if ( tid==0 ) { - //printf("block %d dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); - printf("block%d dot %ld(i,j)=(%ld,%ld) xs,xe= %d,%d ys,ye = %d,%d \n", - b, pair_id, i, j, tx_start,tx_end, ty_start, ty_end); - //for(int a = 0; a < nnzA; a++) printf(" As[%d]:%ld ",a, As[j]); - } - tile.sync(); - */ - - - - // Warp intersection: balanced by design, no idle threads. - // Each 32 thread warp will handle 32 comparisons per loop. - // Either A or B takes stride 4, other takes stride 8 - // For this version A strides 4, B strides 8 - T_A aki; - T_B bkj; - T_Z cij = GB_IDENTITY ; - int Astride = nnzA > nnzB ? 8 : 4; - int Ashift = nnzA > nnzB ? 3 : 2; - int Amask = nnzA > nnzB ? 7 : 3; - int Bstride = nnzB >= nnzA ? 8 : 4; - //printf(" Astride = %d, Bstride = %d\n", Astride, Bstride); - - // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. 
- // just check if cij > 0 - - int cij_exists = 0 ; - - //Warp intersection dot product - int bitty_row = tid & Amask ; - int bitty_col = tid >> Ashift ; - - int k = tx_start + bitty_row ; - int l = ty_start + bitty_col ; - - //Ai[k] = As[ k -pA ]; for lookup - //Bi[l] = Bs[ l -pB ]; - - - int inc_k,inc_l; - - int active = ( ( k < tx_end) && (l < ty_end ) ); - - /* - printf("block%d tid%d Ai,As=%ld,%ld Bi,Bs=%ld,%ld k,l =%d,%d active:%d\n", - b,tid, Ai[k], As[k -pA], Bi[l], Bs[l -pB], - k, l, active ); - */ - - - while ( tile.any(active) ) - { - inc_k = 0; - inc_l = 0; - int kp = k-pA; - int lp = l-pB; - if ( active ) - { - coalesced_group g = coalesced_threads(); - if ( g.thread_rank() == g.size()-1) - { - inc_k = ( As[kp] <= Bs[lp] ) ; - inc_l = ( Bs[lp] <= As[kp] ) ; - // printf("block%d tid%d inc_k= %d inc_l = %d\n",b, tid, inc_k, inc_l ); - } - //tile.sync(); - - if ( As [kp] == Bs [lp] ) - { - //Axs[kp] = Ax[k]; - //Bxs[lp] = Bx[l]; - - GB_GETA ( aki=(T_Z)Axs[kp] ) ; - GB_GETB ( bkj=(T_Z)Bxs[lp] ) ; - if (cij_exists) - { - T_Z t = GB_MULT( (T_Z) aki, (T_Z) bkj); - GB_ADD_F( cij, t ) ; - //printf("block%d thd%d ix at %ld(%ld) cij += %d * %d\n",b, tid, Ai[k], As[kp], aki, bkj); - } - else - { - cij_exists = 1 ; - cij = GB_MULT ( (T_Z) aki, (T_Z) bkj) ; - //printf(" thd%d ix at %ld(%ld) cij = %d * %d \n", tid, Ai[k], Ais[kp], aki, bkj); - } - } - // TODO check terminal condition - //printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); - //printf("block%d tid%d k,l = %d,%d Ai,Bi = %ld,%ld \n", b, tid, k, l, Ai[k], Bi[l] ); - } - //tile.sync(); - //inc_k = tile.shfl_down( inc_k, 31-tid); - if( tile.any(inc_k) ) { - k =1+ tile.shfl_down(k,31-tid) + bitty_row ; // tid%Astride; - //Ais [k-pA] = As[k-pA]; - //Axs [bitty_row] = Ax[k]; - } - if( tile.any(inc_l) ) { - l =1+ tile.shfl_down(l,31-tid) + bitty_col ; // tid/Astride; - //Bis [l-pB] = Bs[l-pB]; - //Bxs [bitty_col] = Bx[l]; - } - active = ( ( k < tx_end) && (l < ty_end ) ); - //printf("block%d tid = %d k = %d l= %d active=%d\n", b, tid, k, l,active); - } - tile.sync(); - - //-------------------------------------------------------------------------- - // reduce sum per-thread values to a single scalar, get OR of flag - //-------------------------------------------------------------------------- - - // Do vote here for control. 
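//------------------------------------------------------------------------------
// A minimal sketch of the warp-vote idiom used here, assuming a 32-thread
// tile: tile.any(p) is true on every lane of the tile if p is true on at
// least one lane, so all lanes agree on whether this C(i,j) exists before
// the reduction (helper name hypothetical).
//------------------------------------------------------------------------------
#include <cooperative_groups.h>

__device__ bool tile_exists_vote (bool my_flag)
{
    cooperative_groups::thread_block_tile<32> tile =
        cooperative_groups::tiled_partition<32>(
            cooperative_groups::this_thread_block ()) ;
    // uniform result across the whole tile
    return (tile.any (my_flag)) ;
}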
- cij_exists = tile.any( cij_exists); - tile.sync(); - - if (cij_exists) - { - cij = GB_reduce_sum( tile, cij ); - } - tile.sync(); - - - // Atomic write result for this block to global mem - if (tid == 0) - { - //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; - if (cij_exists) - { - //printf("block%d i,j =%ld,%ld cij = %d\n",b, i, j, cij); - GB_PUTC( Cx[pair_id] = (T_C) cij ) ; - GB_PUTC ( Ci[pair_id] = i ) ; - - } - else - { - //printf(" dot %d is a zombie\n", pair_id); - zc++; - GB_PUTC ( Ci[pair_id] = GB_FLIP (i) ) ; - } - - //__syncthreads(); - - - if( zc > 0) - { - printf("warp %d zombie count = %d\n", blockIdx.x, zc); - atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); - printf("blk:%d Czombie = %lld\n",blockIdx.x,C->zombies); - } - - } - tile.sync(); - /* - */ - } -} - diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase1.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase1.cuh index e49b97dc6..7084e252b 100644 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase1.cuh +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase1.cuh @@ -1,24 +1,37 @@ //------------------------------------------------------------------------------ -// templates/GB_AxB_cuda_dot3_phase1: symbolic load balancing and data partition +// templates/GB_jit_AxB_phase1.cuh: symbolic load balancing and data partition // to assign work to different 'buckets' for later compute //------------------------------------------------------------------------------ // This kernel scans the non-zero pattern in A and B, takes into account the // mask and computes total work required to form C. Then it classifies each // dot product into a set of buckets for efficient compute. + #pragma once #define GB_CUDA_KERNEL #include -#include "matrix.h" +#include "GB_cuda_kernel.h" #include "GB_cuda_buckets.h" #include +#include +using namespace cooperative_groups; //------------------------------------------------------------------------------ -// GB_bucket_assignment +// GB_bucket_code: assign the dot product for C(i,j) to a specific bucket //------------------------------------------------------------------------------ -// assign the dot product C(i,j) = A(:,i)'*B(:,j) to a specific bucket +// Assigns the dot product C(i,j) = A(:,i)'*B(:,j) to a specific bucket. Both +// A(:,i) and B(:,j) are non-empty when this method is called. + +// GB_BUCKET_ZOMBIE: C(i,j) is a prezombie, either A(:,i) or B(:,j) are +// empty. + +// GB_BUCKET_VSVS both A(:,i) and B(:,j) are very sparse. + +// GB_BUCKET_MERGEPATH both A(:,i) and B(:,j) are sparse, but neither are +// very sparse + __device__ static inline GB_bucket_code GB_bucket_assignment ( int64_t ainz, // # of entries A(:,i), always > 0 @@ -27,143 +40,25 @@ __device__ static inline GB_bucket_code GB_bucket_assignment ) { - int b = 0 ; // no bucket assigned yet +#if 0 - // GB_BUCKET (condition,bucket) : assigns an entry to a bucket, - // if the condition holds, but without using any if statements. - // An entry is assigned once and not reassigned. + // GB_BUCKET (condition,bucket) : assigns an entry to a bucket, if the + // condition holds, but without using if statements (which are slow). An + // entry is assigned once and not reassigned. // If the bucket b has not assigned, it is b = 0. The GB_BUCKET function // tests this case, and if the condition is also true, the expression - // (b==0) * condition * (bucket+1) becomes equal to bucket+1. This - // value is added to b, which is zero, so the final result is that b - // is set to bucket+1. 
+ // (b==0) * condition * (bucket+1) becomes equal to bucket+1. This value + // is added to b, which is zero, so the final result is that b is set to + // bucket+1. - // If the bucket b has been assigned already, we have b > 0. Thus, - // the expression ((b==0) * condition * (bucket+1)) becomes zero. - // When added to b, the result is that b doesn't change, so the bucket - // assignment b is unmodified. + // If the bucket b has been assigned already, we have b > 0. Thus, the + // expression ((b==0) * condition * (bucket+1)) becomes zero. When added + // to b, the result is that b doesn't change, so the bucket assignment b is + // unmodified. #define GB_BUCKET(condition,bucket) \ b = (((b == 0) * (condition)) * (bucket+1)) + b ; - -// if (ia_last < ib_first || ib_last < ia_first) - { - - //---------------------------------------------------------------------- - // pattern of A(:,i) and B(:,j) do not overlap - //---------------------------------------------------------------------- - - // The patterns of A(:,i) and B(:,j) are always sorted. If the last - // entry in A(:,i) comes before the first entry in B(:,j), or visa - // versa, then there is no work to do since C(i,j) must be a zombie. - - //GB_BUCKET (ia_last < ib_first || ib_last < ia_first, GB_BUCKET_ZOMBIE); - - } -// else if (bjnz == vlen && ainz == vlen && vlen > 256) - { - - //---------------------------------------------------------------------- - // both A(:,i) and B(:,j) are dense - //---------------------------------------------------------------------- - - // No search of A(:,i) or B(:,j) is needed. Total work is O(vlen). - // The intersection is non-empty, so C(i,j) cannot be a zombie. - - // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit - - GB_BUCKET (bjnz == vlen && ainz == vlen && vlen > 256, GB_BUCKET_DNDN) ; - - } -// else if (ainz == vlen) - { - - //---------------------------------------------------------------------- - // A(:,i) is dense and B(:,j) is sparse - //---------------------------------------------------------------------- - - // No search of A(:,i) is needed. Total work is O(bjnz), via a linear - // time scan of B(:,j). Since A(:,i) is dense and B(:,j) is non-empty, - // the intersection is non-empty, so C(i,j) cannot be a zombie. - - // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit - // Two buckets are used, depending on bjnz. - GB_BUCKET (ainz == vlen && bjnz < 256, GB_BUCKET_DNVS) ; - GB_BUCKET (ainz == vlen && bjnz >= 256, GB_BUCKET_DNSP) ; - - } -// else if (bjnz == vlen) - { - - //---------------------------------------------------------------------- - // A(:,i) is sparse and B(:,j) is dense - //---------------------------------------------------------------------- - - // No search of B(:,j) is needed. Total work is O(ainz), via a linear - // time scan of A(:,i). Since B(:,j) is dense and A(:,i) is non-empty, - // the intersection is non-empty, so C(i,j) cannot be a zombie. - - // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit - // Two buckets are used, depending on ainz. 
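//------------------------------------------------------------------------------
// A small sketch of the branchless first-match idiom that GB_BUCKET
// implements: a candidate adds (bucket+1) to b only while b is still zero
// (unassigned) and its condition holds, so the first true condition wins and
// later candidates leave b unchanged (bucket codes here are illustrative).
//------------------------------------------------------------------------------
__device__ int pick_bucket_sketch (int64_t ainz, int64_t bjnz)
{
    int b = 0 ;                         // 0 means "no bucket assigned yet"
    #define PICK(condition,bucket) \
        b = (((b == 0) * (condition)) * ((bucket) + 1)) + b ;
    PICK (ainz + bjnz <= 128, 1)        // e.g. a "very sparse" bucket
    PICK (true, 2)                      // e.g. the merge-path fallback
    #undef PICK
    return (b - 1) ;                    // undo the "bucket+1" encoding
}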
-        GB_BUCKET (bjnz == vlen && ainz < 256, GB_BUCKET_VSDN) ;
-        GB_BUCKET (bjnz == vlen && ainz >= 256, GB_BUCKET_SPDN) ;
-
-    }
-//  else if ((ainz > 32 * bjnz && bjnz < 256)
-//       || (bjnz > 32 * ainz && ainz < 256))
-    {
-
-        //----------------------------------------------------------------------
-        // A(:,i) is very sparse compared to B(:,j), or visa versa
-        //----------------------------------------------------------------------
-
-        // Since B(:,j) is small, and much smaller than A(:,i), the efficient
-        // way to compute C(i,j) is a linear scan of B(:,j).  For each B(k,j),
-        // a binary search for the index A(k,i) is done.  The expected work to
-        // compute C(i,j) is thus O(bjnz * log2 (ainz)).  If A(:,i) is very
-        // sparse compared to B(:,j), the opposite is done inside the kernel.
-
-        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit
-
-        GB_BUCKET ((ainz > 32 * bjnz && bjnz < 256)
-                || (bjnz > 32 * ainz && ainz < 256), GB_BUCKET_VSSP) ;
-
-    }
-//  else if (ainz + bjnz <= 4)
-    {
-
-        //----------------------------------------------------------------------
-        // both A(:,i) and B(:,j) are very tiny (total size 4 or less)
-        //----------------------------------------------------------------------
-
-        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit
-        //GB_BUCKET (ainz + bjnz <= 4, GB_BUCKET_VSVS_4) ;
-
-    }
-//  else if (ainz + bjnz <= 16)
-    {
-
-        //----------------------------------------------------------------------
-        // both A(:,i) and B(:,j) are tiny (total size 16 or less)
-        //----------------------------------------------------------------------
-
-        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit
-        //GB_BUCKET (ainz + bjnz <= 16, GB_BUCKET_VSVS_16) ;
-
-    }
-//  else if (ainz + bjnz <= 64)
-    {
-
-        //----------------------------------------------------------------------
-        // both A(:,i) and B(:,j) are small (total size 64 or less)
-        //----------------------------------------------------------------------
-
-        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit
-        //GB_BUCKET (ainz + bjnz <= 64, GB_BUCKET_VSVS_64) ;
-
-    }
-//  else if (ainz + bjnz <= 256)
    {

        //----------------------------------------------------------------------
@@ -171,16 +66,19 @@ __device__ static inline GB_bucket_code GB_bucket_assignment
        //----------------------------------------------------------------------

        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit
-        GB_BUCKET (ainz + bjnz <= 256, GB_BUCKET_VSVS_256) ;
+        GB_BUCKET (ainz + bjnz <= 128, GB_BUCKET_VSVS) ;

    }
-//  else
    {

        //----------------------------------------------------------------------
        // default: use the merge-path method
        //----------------------------------------------------------------------

+        // A(:,i) and B(:,j) are both sparse, but not very sparse.  The total #
+        // of entries in both vectors is > 128, so the merge-path method
+        // is used.
+
        // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_mp.cu.jit
        GB_BUCKET (true, GB_BUCKET_MERGEPATH) ;
    }
@@ -188,461 +86,271 @@ __device__ static inline GB_bucket_code GB_bucket_assignment
    // subtract one to undo the "bucket+1" assignment in the
    // GB_BUCKET macro assignment expression.
return (GB_bucket_code) (b-1) ;
+#endif
+
}

-//--------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // GB_AxB_cuda_phase1: build nanobuckets, hunt for pre-zombies
-//--------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 // GB_AxB_cuda_dot3_phase1 is a CUDA kernel that scans all entries in C and
-// assigns them to each of the 12 buckets. The output is a 12-by-blockDim array of
-// bucket counts, per threadblock (the nanobucket array). Each of the blockDim.x
-// threads has its own set of 12 bucket counts. Each threadblock in this
-// kernel then computes the first part of the cumulative sum of the
-// nanobuckets, and writes it to global memory.
+// assigns them to each of the NBUCKETS buckets.  The output is a
+// NBUCKETS-by-blockDim array of bucket counts, per threadblock (the nanobucket
+// array).  Each of the blockDim.x threads has its own set of NBUCKETS bucket
+// counts.  Each threadblock in this kernel then computes the first part of the
+// cumulative sum of the nanobuckets, and writes it to global memory.

 // The kernel also computes Ci, of size nnz(C), which contains the
 // zombie assignment or bucket assignment for non-zombies in C.

-template <typename Type_M>
+// FIXME: use 2 buckets? mp and vsvs?  What if all entries are in one bucket;
+// can we skip the bucket creation?
+
+template <typename T_M>
__global__ void AxB_phase1
(
    // outputs, preallocated in global memory:
-    int64_t *nanobuckets,           // array of size 12-blockDim.x-by-gridDim.x
-    int64_t *blockbucket,           // bucket counts, of size 12-by-gridDim.x
+    int64_t *nanobuckets,           // array of size NBUCKETS-blockDim.x-by-gridDim.x
+    int64_t *blockbucket,           // bucket counts, of size NBUCKETS-by-gridDim.x
    // input/output:
-    GrB_Matrix C,                   // final output matrix
+    GrB_Matrix C,                   // final output matrix
    // inputs, not modified:
-    const GrB_Matrix M,             // mask matrix
-    const GrB_Matrix A,             // input matrix
-    const GrB_Matrix B              // input matrix
+    const GrB_Matrix M,             // mask matrix
+    const GrB_Matrix A,             // input matrix
+    const GrB_Matrix B              // input matrix
)
{

    //--------------------------------------------------------------------------
    // get C, M, A, and B
    //--------------------------------------------------------------------------
-
+
    const int64_t *__restrict__ Mh = M->h ;
    const int64_t *__restrict__ Mp = M->p ;
    const int64_t *__restrict__ Mi = M->i ;
-    const Type_M *__restrict__ Mx = (Type_M*)M->x ; // not accessed if M is structural
+    const T_M *__restrict__ Mx = (T_M*) M->x ; // not accessed if M structural
    const int64_t mnvec = M->nvec ;
    const int64_t mvlen = M->vlen ;
-    const int64_t mnz = GB_nnz(M) ;
+    const int64_t mnz = M->p[M->nvec]; //GB_nnz(M) ;
    const bool M_is_hyper = M->h != NULL ;

    const int64_t *__restrict__ Ah = A->h ;
    const int64_t *__restrict__ Ap = A->p ;
    const int64_t *__restrict__ Ai = A->i ;
    const int64_t avlen = A->vlen ;
-    const int64_t anz = GB_nnz(A) ;
+    const int64_t anz = A->p[A->nvec]; //GB_nnz(A) ;
    const bool A_is_hyper = A->h != NULL ;

    const int64_t *__restrict__ Bh = B->h ;
    const int64_t *__restrict__ Bp = B->p ;
    const int64_t *__restrict__ Bi = B->i ;
    const int64_t bvlen = B->vlen ;
-    const int64_t bnz = GB_nnz(B);
+    const int64_t bnz = B->p[B->nvec]; //GB_nnz(B);
    const bool B_is_hyper = B->h != NULL ;

    // int64_t *restrict Cp = C->p ;  // copy of Mp
    // int64_t *restrict Ch = C->h ;  // copy of Mh
-    int64_t *__restrict__ Ci = C->i ;   // for zombies, or
bucket assignment + int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and - // where bucket is the bucket assignment for C(i,j). + // where bucket is the bucket assignment for C(i,j). // bucket can be recovered from Ci by bucket = Ci & 0xF //-------------------------------------------------------------------------- // clear the bucket counters //-------------------------------------------------------------------------- + int64_t my_bucket[NBUCKETS]; - //ASSERT (mnz > 0) ; - //ASSERT (gridDim.x <= mnz) ; - - - // each thread uses 12 bucket counters, held in register - int64_t my_bucket_0 = 0 ; - int64_t my_bucket_1 = 0 ; - int64_t my_bucket_2 = 0 ; - int64_t my_bucket_3 = 0 ; - int64_t my_bucket_4 = 0 ; - int64_t my_bucket_5 = 0 ; - int64_t my_bucket_6 = 0 ; - int64_t my_bucket_7 = 0 ; - int64_t my_bucket_8 = 0 ; - int64_t my_bucket_9 = 0 ; - int64_t my_bucket_10 = 0 ; - int64_t my_bucket_11 = 0 ; - - // Registers cannot be indexed (!) so this macro is used instead. - // The bucket registers are indexed by the GB_bucket_code enum. - #define GB_BUCKET_COUNT(bucket) \ - { \ - switch (bucket) \ - { \ - case 0: my_bucket_0++ ; break ; \ - case 1: my_bucket_1++ ; break ; \ - case 2: my_bucket_2++ ; break ; \ - case 3: my_bucket_3++ ; break ; \ - case 4: my_bucket_4++ ; break ; \ - case 5: my_bucket_5++ ; break ; \ - case 6: my_bucket_6++ ; break ; \ - case 7: my_bucket_7++ ; break ; \ - case 8: my_bucket_8++ ; break ; \ - case 9: my_bucket_9++ ; break ; \ - case 10: my_bucket_10++ ; break ; \ - case 11: my_bucket_11++ ; break ; \ - } \ - } - /* - if(threadIdx.x==0 ) { - printf(" in phase1 kernel, mnz,anz,bnz= %ld,%ld,%ld\n",mnz,anz,bnz); + // ASSERT (mnz > 0) ; + // ASSERT (gridDim.x <= mnz) ; + + // each thread uses NBUCKETS bucket counters, held in register + #pragma unroll + for(int b = 0; b < NBUCKETS; ++b) { + my_bucket[b] = 0; } - __syncthreads(); - */ - #define pointerchunk 256 - __shared__ int64_t Mps[pointerchunk]; - __shared__ int64_t ks [chunksize]; + __shared__ int64_t ks [chunk_size] ; - __syncthreads(); - if (threadIdx.x==0 && blockIdx.x == 0) - { -// printf ("Here in phase1, what I see is this:\n") ; -// printf ("MX(pM) is: %s\n", GB_XSTR (MX (pM))) ; -// printf ("GB_MULT(x,y) is: %s\n", GB_XSTR (GB_MULT (x,y))) ; -// printf ("GB_ADD(x,y) is: %s\n", GB_XSTR (GB_ADD (x,y))) ; - // #define GB_GETA(blob) - // #define GB_GETB(blob) - // #define GB_MULT(x,y) (1) - // #define GB_ADD(x,y) ((x) + (y)) - // #define GB_IDENTITY (0) - // #define GB_TERMINAL_CONDITION(cij) (false) - // #define GB_IF_TERMINAL_BREAK - // #define GB_PUTC(blob) blob - // #define GB_MTYPE void - // #define MX(p) true - // #define GB_MASK_COMP false - // #define GB_NO_MASK false - // #define GB_C_IS_SPARSE 1 - // #define GB_C_IS_HYPER 0 - // #define GB_C_IS_BITMAP 0 - // #define GB_C_IS_FULL 0 - // #define GB_M_IS_SPARSE 1 - // #define GB_M_IS_HYPER 0 - // #define GB_M_IS_BITMAP 0 - // #define GB_M_IS_FULL 0 - // #define GB_A_IS_SPARSE 1 - // #define GB_A_IS_HYPER 0 - // #define GB_A_IS_BITMAP 0 - // #define GB_A_IS_FULL 0 - // #define GB_B_IS_SPARSE 1 - // #define GB_B_IS_HYPER 0 - // #define GB_B_IS_BITMAP 0 - // #define GB_B_IS_FULL 0 - } - __syncthreads(); //-------------------------------------------------------------------------- - // compute the task descriptor + // assign all 
entries of C to the buckets //-------------------------------------------------------------------------- // all threads in this block will compute the same values for these: int64_t pfirst, plast, kfirst, klast ; - int64_t chunk_max= (mnz + chunksize -1)/chunksize; + int64_t chunk_max = GB_ICEIL (mnz, chunk_size) ; + // (mnz + chunk_size -1)/chunk_size; for ( int64_t chunk = blockIdx.x; chunk < chunk_max; - chunk += gridDim.x ) + chunk += gridDim.x ) { - // The slice for each task contains entries pfirst:plast-1 of M and C. - //GB_PARTITION (pfirst, plast, mnz, chunk, (mnz+1023)/1024 ) ; - pfirst = chunksize * chunk ; - plast = GB_IMIN( chunksize * (chunk+1), mnz ) ; - - int64_t chunk_end; - if ( mnz > chunksize) chunk_end = GB_IMIN( chunksize, - mnz - chunksize*(chunk) ) ; - else chunk_end = mnz; - - // find the first vector of the slice for this chunk: the - // vector that owns the entry Ai [pfirst] and Ax [pfirst]. - kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; - //if( pfirst ==0) kfirst = 0; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Ai [plast-1] and Ax [plast-1]. - klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen) ; - - int64_t k_end = GB_IMIN( pointerchunk , klast - kfirst +2 ) ; - -// if( threadIdx.x ==0) -// { -// printf("chunk%ld pfirst,plast,ch_end =%ld,%ld,%ld kfirst,klast,kend = %ld,%ld,%ld\n", -// chunk, pfirst, plast, chunk_end, kfirst, klast, k_end ) ; -// } - __syncthreads(); - - - - // load pointer values for this chunk - for ( int64_t i = threadIdx.x; i< k_end; i+= blockDim.x) - { - Mps[i] = Mp[i + kfirst]; - } - __syncthreads(); - if (threadIdx.x == 0) - { -// for (int64_t i = 0 ; i < k_end ; i++) -// { -// printf ("Mps [%d] = %ld\n", i, Mps [i]) ; -// } - } - __syncthreads(); - - - // search for k values for each entry - float slope = (float)(mnvec)/(float)(mnz* chunksize) ; - for ( int64_t i = threadIdx.x; i< chunk_end; i+= blockDim.x) - { - ks[i] = kfirst + slope*( float )(i); - while ( Mps[ ks[i] - kfirst + 1 ] <= (i+pfirst) ) - ks[i]++; - while ( Mps[ ks[i] - kfirst ] > (i+pfirst) ) - ks[i]--; - } - __syncthreads(); - if (threadIdx.x == 0) - { -// for (int64_t i = 0 ; i < chunksize ; i++) -// { -// printf ("ks [%d] = %ld\n", i, ks [i]) ; -// } - } - __syncthreads(); - - - //ASSERT (0 <= kfirst && kfirst <= klast && klast < mnvec) ; - /* - if (threadIdx.x ==0 ) { - printf ("threadblock %d after ksearch pfirst %ld plast %ld kfirst %ld klast %ld\n", - blockIdx.x, pfirst, plast, kfirst, klast) ; - } - __syncthreads(); - */ + //---------------------------------------------------------------------- + // determine the work done by this iteration, "chunk" + //---------------------------------------------------------------------- - //-------------------------------------------------------------------------- - // assign entries in C(i,j) to the buckets - //-------------------------------------------------------------------------- + // The slice for each task contains entries pfirst:plast-1 of M and C. + // This iteration "chunk" computes Ci and Cx [pfirst...plast-1], using + // Mi and Mx [pfirst:plast-1]. All threads in the thread block are + // used for this "chunk". + pfirst = chunk_size * chunk ; + plast = pfirst + chunk_size ; + // plast = GB_IMIN (plast, mnz) ; + if (plast > mnz) plast = mnz ; + int64_t my_chunk_size = plast - pfirst ; + + // find the first vector of the slice for this chunk: the + // vector that owns the entry Mi [pfirst] and Mx [pfirst]. 
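//------------------------------------------------------------------------------
// A sketch of the ownership search used just below (hypothetical helper, not
// the library's GB_search_for_vector_device): assuming Mp is a monotone
// pointer array with Mp [kleft] <= p and p < Mp [kright], the vector k that
// owns entry p is the unique k with Mp [k] <= p < Mp [k+1], found here by
// plain binary search.
//------------------------------------------------------------------------------
__device__ int64_t owner_of_entry_sketch
(
    int64_t p,              // entry position to locate
    const int64_t *Mp,      // vector pointers, Mp [kleft] <= p < Mp [kright]
    int64_t kleft,
    int64_t kright
)
{
    while (kright - kleft > 1)
    {
        int64_t kmid = kleft + (kright - kleft) / 2 ;
        if (Mp [kmid] <= p) kleft = kmid ;      // owner is kmid or later
        else                kright = kmid ;     // owner is before kmid
    }
    return (kleft) ;        // Mp [kleft] <= p < Mp [kleft+1]
}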
+ kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec, mvlen) ; + + // find the last vector of the slice for task blockIdx.x: the + // vector that owns the entry Mi [plast-1] and Mx [plast-1]. + klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec, mvlen); + + // number of vectors in C and M for this "chunk" iteration, where + // Mp [kfirst:klast] will be operated on. + int64_t nk = klast - kfirst + 1 ; - // if B is hypersparse, bpleft ... TODO describe - // int64_t bpleft = 0 ; - //---------------------------------------------------------------------- - // no binary search variant + // fill ks to find all indices //---------------------------------------------------------------------- - //printf ("no binary search\n") ; + // search for k values for each entry pfirst:plast-1 + float slope = ((float) nk) / ((float) my_chunk_size) ; + int64_t mnvec1 = mnvec - 1 ; + for (int64_t kk = threadIdx.x ; kk < my_chunk_size ; kk += blockDim.x) + { + // get a rough estimate of k for the kkth entry in ks + int64_t k = kfirst + (int64_t) (slope * ((float) kk)) ; + // k cannot be smaller than kfirst, but might be bigger than + // mnvec-1, so ensure it is in the valid range, kfirst to mnvec-1 + // k = GB_IMIN (k, mnvec-1) ; + if (k > mnvec1) k = mnvec1 ; + // look for p in Mp, where p is in range pfirst:plast-1 + // where pfirst >= 0 and plast < mnz + int64_t p = kk + pfirst ; + // linear-time search for the k value of the pth entry + while ( Mp [ k + 1 ] <= p ) k++ ; + while ( Mp [ k ] > p ) k-- ; + ks [kk] = k ; + } + this_thread_block().sync(); + + //---------------------------------------------------------------------- + // assign entries in C(i,j) to the buckets + //---------------------------------------------------------------------- + + // if B is hypersparse, bpleft ... TODO describe + // int64_t bpleft = 0 ; - //int32_t pM_start, pM_end ; - //for (int64_t pM = pfirst + threadIdx.x ; pM < plast ; pM += blockDim.x) - - //for (int64_t pM = pfirst; pM < plast; pM++ ) for ( int64_t pM = pfirst + threadIdx.x; - pM < pfirst + chunk_end; + pM < pfirst + my_chunk_size; pM += blockDim.x ) { GB_bucket_code bucket = GB_BUCKET_ZOMBIE ; - int64_t k = ks[ pM - pfirst ] ; - //k += ( pM == Mp[k+1] ) ; -// printf ("tid%d k %ld pM %ld MX(pM): %d\n", threadIdx.x, k, pM, MX (pM)); + int64_t k = ks [pM - pfirst] ; // get the k value of Mi,Mx [pM]. int64_t i = Mi [ pM ] ; -int64_t j = k ; // HACK, does not need to be initialized here - + int64_t j = k ; // HACK, does not need to be initialized here if ( MX ( pM ) ) - { + { - // do a binary search for k (and j) that has this entry M(i,j) - //k = GB_search_for_vector_device (pM, Mp, k, klast) ; + // FIXME: handle the case where M, A, B are hypersparse -// HACK -j = k ; -// int64_t j = (Mh == NULL) ? k : Mh [k] ; + // HACK + j = k ; + // int64_t j = (Mh == NULL) ? 
k : Mh [k] ; - //-------------------------------------------------------------- - // get B(:,j) - //-------------------------------------------------------------- + //-------------------------------------------------------------- + // get B(:,j) + //-------------------------------------------------------------- - int64_t pB, pB_end ; -// HACK: for sparse only, not hypersparse + int64_t pB, pB_end ; -pB = Bp [j] ; -pB_end = Bp [j+1] ; -// GB_lookup_device (B_is_hyper, Bh, Bp, &bpleft, bnvec-1, j, -// &pB, &pB_end) ; + // HACK: for sparse only, not hypersparse + pB = Bp [j] ; + pB_end = Bp [j+1] ; + // GB_lookup_device (B_is_hyper, Bh, Bp, &bpleft, bnvec-1, j, + // &pB, &pB_end) ; int64_t bjnz = pB_end - pB ; if (bjnz > 0) { - // int64_t ib_first = Bi [pB] ; - // int64_t ib_last = Bi [pB_end-1] ; //---------------------------------------------------------- // get A(:,i) //---------------------------------------------------------- int64_t pA, pA_end ; - //int64_t apleft = 0 ; -// HACK: for sparse only, not hypersparse -pA = Ap [i] ; -pA_end = Ap [i+1] ; -// GB_lookup_device (A_is_hyper, Ah, Ap, &apleft, anvec-1, i, -// &pA, &pA_end) ; + // int64_t apleft = 0 ; + // HACK: for sparse only, not hypersparse + pA = Ap [i] ; + pA_end = Ap [i+1] ; + // GB_lookup_device (A_is_hyper, Ah, Ap, &apleft, anvec-1, + // i, &pA, &pA_end) ; int64_t ainz = pA_end - pA ; if (ainz > 0) { - // int64_t ia_first = Ai [pA] ; - // int64_t ia_last = Ai [pA_end-1] ; - - //------------------------------------------------------ // determine the bucket for C(i,j) - //------------------------------------------------------ - - //bucket = GB_BUCKET_MERGEPATH ; - bucket= GB_bucket_assignment ( ainz, bjnz, bvlen) ; -// printf ("tid%d i %ld j %ld ainz %ld bjnz %ld: bucket %d\n", -// threadIdx.x, i, j, ainz, bjnz, (int) bucket) ; + bool vsvs = (ainz + bjnz <= 128) ; + bucket = (GB_bucket_code) + ( ((int) ( vsvs)) * ((int) GB_BUCKET_VSVS) + + ((int) (!vsvs)) * ((int) GB_BUCKET_MERGEPATH)) ; } } } - if (bucket == GB_BUCKET_ZOMBIE) - { - // mark C(i,j) is a zombie -// printf ("tid%d pM=%d %d,%d prezombie\n",threadIdx.x,pM,i,j) ; - Ci [pM] = GB_FLIP (i) << 4 ; - // GB_BUCKET_COUNT (GB_BUCKET_ZOMBIE) ; - my_bucket_0++ ; //0 is the zombie bucket - } - else - { - // place C(i,j) in its bucket - Ci [pM] = (k << 4) + bucket ; - GB_BUCKET_COUNT (bucket) ; -// printf ("tid%d pM=%d %d,%d b=%d\n",threadIdx.x, pM, i,j, (int)bucket) ; - } - } - - - + Ci[pM] = (bucket == GB_BUCKET_ZOMBIE) * ( GB_FLIP(i) << 4) + + (bucket != GB_BUCKET_ZOMBIE) * ((k<<4) + bucket) ; + my_bucket[bucket]++; + } } - __syncthreads(); + this_thread_block().sync(); //-------------------------------------------------------------------------- // cumulative sum of each bucket //-------------------------------------------------------------------------- - typedef cub::BlockScan BlockCumSum; - __shared__ typename BlockCumSum::TempStorage temp_storage; + typedef cub::BlockScan BlockCumSum; + __shared__ typename BlockCumSum::TempStorage temp_storage ; // The taskbucket for this thread block is an array of size - // 12-by-blockDim.x, held by row. Each thread owns one column of this - // taskbucket, the nanobucket. The nanobucket is a column of length 12, - // with stride equal to blockDim.x. + // NBUCKETS-by-blockDim.x, held by row. Each thread owns one column of + // this taskbucket, the nanobucket. The nanobucket is a column of length + // NBUCKETS, with stride equal to blockDim.x. 
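//------------------------------------------------------------------------------
// A small sketch of the indexing this layout implies (helper name
// hypothetical; assumes the NBUCKETS macro from GB_cuda_buckets.h is in
// scope): nanobuckets is a 3D array of extent
// gridDim.x-by-NBUCKETS-by-blockDim.x, flattened by row, so thread t of block
// blk finds its count for bucket b at the offset computed here, and walks its
// own column with stride nthreads.
//------------------------------------------------------------------------------
__device__ static inline int64_t nanobucket_index
(
    int blk,        // threadblock id, 0 <= blk < gridDim.x
    int b,          // bucket,         0 <= b   < NBUCKETS
    int t,          // thread id,      0 <= t   < blockDim.x
    int nthreads    // blockDim.x
)
{
    return (((int64_t) blk) * (NBUCKETS * nthreads)     // this block's slab
          + ((int64_t) b)   * nthreads                  // this bucket's row
          + t) ;                                        // this thread's column
}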
int64_t *nanobucket = nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) + threadIdx.x ; - #define CUMSUM_AND_STORE_NANOBUCKET(bucket) \ - if( threadIdx.x == blockDim.x-1) \ - blockbucket [blockIdx.x + bucket * gridDim.x] = \ - my_bucket_ ## bucket ; \ - BlockCumSum(temp_storage).ExclusiveSum \ - ( my_bucket_ ## bucket, my_bucket_ ## bucket) ; \ - __syncthreads(); \ - nanobucket [bucket * blockDim.x] = my_bucket_ ## bucket ; - - CUMSUM_AND_STORE_NANOBUCKET (0) ; - CUMSUM_AND_STORE_NANOBUCKET (1) ; - CUMSUM_AND_STORE_NANOBUCKET (2) ; - CUMSUM_AND_STORE_NANOBUCKET (3) ; - CUMSUM_AND_STORE_NANOBUCKET (4) ; - CUMSUM_AND_STORE_NANOBUCKET (5) ; - CUMSUM_AND_STORE_NANOBUCKET (6) ; - CUMSUM_AND_STORE_NANOBUCKET (7) ; - CUMSUM_AND_STORE_NANOBUCKET (8) ; - CUMSUM_AND_STORE_NANOBUCKET (9) ; - CUMSUM_AND_STORE_NANOBUCKET (10) ; - CUMSUM_AND_STORE_NANOBUCKET (11) ; - - /* - if(threadIdx.x +blockIdx.x*blockDim.x <= mnvec) //blockDim.x -1) - { - printf("thd %d blk%d nbucket0 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[0]); - printf("thd %d blk%d nbucket1 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[1*blockDim.x]); - printf("thd %d blk%d nbucket2 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[2*blockDim.x]); - printf("thd %d blk%d nbucket3 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[3*blockDim.x]); - printf("thd %d blk%d nbucket4 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[4*blockDim.x]); - printf("thd %d blk%d nbucket5 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[5*blockDim.x]); - printf("thd %d blk%d nbucket6 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[6*blockDim.x]); - printf("thd %d blk%d nbucket7 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[7*blockDim.x]); - printf("thd %d blk%d nbucket8 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[8*blockDim.x]); - printf("thd %d blk%d nbucket9 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[9*blockDim.x]); - printf("thd %d blk%d nbucket10 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[10*blockDim.x]); - printf("thd %d blk%d nbucket11 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[11*blockDim.x]); + #pragma unroll + for(int b = 0; b < NBUCKETS; ++b) { + if( threadIdx.x == blockDim.x-1) { + blockbucket [blockIdx.x + b * gridDim.x] = my_bucket[b] ; + } + this_thread_block().sync(); + + BlockCumSum(temp_storage).ExclusiveSum( my_bucket[b], my_bucket[b]) ; + + this_thread_block().sync(); + + nanobucket [b * blockDim.x] = my_bucket[b] ; } - __syncthreads(); - */ - + // The last thread now has the sum of all nanobuckets, which is then saved // to the global bucket counts. blockbucket is an array of size - // 12-by-gridDim.x, held by row, with one column per thread block. + // NBUCKETS-by-gridDim.x, held by row, with one column per thread block. // The last thread saves its result in the column of this thread block. // Note that this write to global memory is not coalesced. 
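//------------------------------------------------------------------------------
// A standalone sketch of the cub::BlockScan pattern used above, assuming a
// fixed block of 32 threads: each thread contributes one count and gets back
// the exclusive prefix sum of all counts before it (kernel name hypothetical).
//------------------------------------------------------------------------------
#include <cub/cub.cuh>

__global__ void exclusive_sum_sketch (int64_t *counts)  // counts: size 32
{
    typedef cub::BlockScan<int64_t, 32> BlockScan ;
    __shared__ typename BlockScan::TempStorage temp_storage ;
    int64_t c = counts [threadIdx.x] ;
    // after this call, c = counts [0] + ... + counts [threadIdx.x - 1]
    BlockScan (temp_storage).ExclusiveSum (c, c) ;
    counts [threadIdx.x] = c ;
}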
- #define STORE_GLOBAL_BUCKET_COUNT(bucket) \ - blockbucket [bucket * gridDim.x + blockIdx.x] += \ - my_bucket_ ## bucket ; - - if (threadIdx.x == blockDim.x - 1 ) + if (threadIdx.x == blockDim.x - 1 ) { - STORE_GLOBAL_BUCKET_COUNT (0) ; - STORE_GLOBAL_BUCKET_COUNT (1) ; - STORE_GLOBAL_BUCKET_COUNT (2) ; - STORE_GLOBAL_BUCKET_COUNT (3) ; - STORE_GLOBAL_BUCKET_COUNT (4) ; - STORE_GLOBAL_BUCKET_COUNT (5) ; - STORE_GLOBAL_BUCKET_COUNT (6) ; - STORE_GLOBAL_BUCKET_COUNT (7) ; - STORE_GLOBAL_BUCKET_COUNT (8) ; - STORE_GLOBAL_BUCKET_COUNT (9) ; - STORE_GLOBAL_BUCKET_COUNT (10) ; - STORE_GLOBAL_BUCKET_COUNT (11) ; - } - - /* - if(threadIdx.x == blockDim.x -1){ - - printf("block%d bbucket0 has %ld entries\n",blockIdx.x, blockbucket[0*gridDim.x+blockIdx.x]); - printf("block%d bbucket1 has %ld entries\n",blockIdx.x, blockbucket[1*gridDim.x+blockIdx.x]); - printf("block%d bbucket2 has %ld entries\n",blockIdx.x, blockbucket[2*gridDim.x+blockIdx.x]); - printf("block%d bbucket3 has %ld entries\n",blockIdx.x, blockbucket[3*gridDim.x+blockIdx.x]); - printf("block%d bbucket4 has %ld entries\n",blockIdx.x, blockbucket[4*gridDim.x+blockIdx.x]); - printf("block%d bbucket5 has %ld entries\n",blockIdx.x, blockbucket[5*gridDim.x+blockIdx.x]); - printf("block%d bbucket6 has %ld entries\n",blockIdx.x, blockbucket[6*gridDim.x+blockIdx.x]); - printf("block%d bbucket7 has %ld entries\n",blockIdx.x, blockbucket[7*gridDim.x+blockIdx.x]); - printf("block%d bbucket8 has %ld entries\n",blockIdx.x, blockbucket[8*gridDim.x+blockIdx.x]); - printf("block%d bbucket9 has %ld entries\n",blockIdx.x, blockbucket[9*gridDim.x+blockIdx.x]); - printf("block%d bbucket10 has %ld entries\n",blockIdx.x, blockbucket[10*gridDim.x+blockIdx.x]); - printf("block%d bbucket11 has %ld entries\n",blockIdx.x, blockbucket[11*gridDim.x+blockIdx.x]); - + #pragma unroll + for(int b = 0; b < NBUCKETS; ++b) { + blockbucket [b * gridDim.x + blockIdx.x] += my_bucket[b]; + } } - __syncthreads(); - */ - } diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2.cuh index f052e18b2..63d960640 100644 --- a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2.cuh +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2.cuh @@ -8,7 +8,7 @@ #define GB_CUDA_KERNEL #include "GB_cuda_buckets.h" -#include "matrix.h" +#include "GB_cuda_kernel.h" #include #include @@ -33,13 +33,13 @@ struct BlockPrefixCallbackOp } }; -__inline__ +__inline__ __device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nblocks) { #define blocksize 32 // Specialize BlockScan for a 1D block of 32 threads - typedef cub::BlockScan BlockScan; + typedef cub::BlockScan BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage temp_storage; @@ -59,26 +59,26 @@ __device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nbloc //printf("block %d entering sum\n",blockIdx.x); int loc = block_id + threadIdx.x; if ( loc < nblocks) - { + { //printf("block %di loading tid=%d\n",block_id,tid); - data = blockbucket[bucketId*nblocks +loc ] ; + data = blockbucket[bucketId*nblocks +loc ] ; } - __syncthreads(); + this_thread_block().sync(); - //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, - // blockbucket[bucketId*nblocks + block_id+threadIdx.x] ) ; + //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, + // blockbucket[bucketId*nblocks +loc] ) ; // Collectively compute the block-wide exclusive prefix sum BlockScan(temp_storage).ExclusiveSum( data, data, prefix_op); - __syncthreads(); + 
this_thread_block().sync();

      if ( loc < nblocks)
-      {
-         blockbucket[bucketId*nblocks +loc ] = data ;
+      {
+         blockbucket[bucketId*nblocks +loc ] = data ;
      }
-      __syncthreads();
+      //this_thread_block().sync();
+
+      //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks +loc] ) ;
-      //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks+block_id+threadIdx.x] ) ;
-      data = 0;
   }
}

@@ -104,19 +104,22 @@ __inline__ __device__ T block_ReduceSum(thread_block g, T val)
   thread_block_tile tile = tiled_partition( g );

   // Each warp performs partial reduction
-   val = warp_ReduceSumPlus( tile, val);
+   val = warp_ReduceSumPlus( tile, val);

   // Wait for all partial reductions
-   if (lane==0) {
+   if (lane==0) {
      //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val);
      shared[wid]=val; // Write reduced value to shared memory
      //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val);
   }
-   __syncthreads();              // Wait for all partial reductions
+   this_thread_block().sync();   // Wait for all partial reductions
+
+   if (wid > 0 ) return val ;
-   if (wid > 0 ) return val ;
+   //read from shared memory only if that warp existed
+   val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0;

   //Final reduce within first warp
-   if (wid==0) val = warp_ReduceSumPlus( tile, val) ;
+   if (wid==0) val = warp_ReduceSumPlus( tile, val) ;

   return val;
}

@@ -129,11 +132,11 @@ __global__ void AxB_phase2
(
   // input, not modified:
-   int64_t *__restrict__ blockbucket,  // global bucket count, of size 12*nblocks
+   int64_t *__restrict__ blockbucket,  // global bucket count, of size NBUCKETS*nblocks
   // output:
   int64_t *__restrict__ offset,        // global offsets, for each bucket
   // inputs, not modified:
-   const int nblocks                   // input number of blocks to reduce
+   const int nblocks                   // input number of blocks to reduce across, ie size of vector for 1 bucket
)
{

@@ -141,96 +144,60 @@ __global__ void AxB_phase2
   // sum up the bucket counts of prior threadblocks
   //--------------------------------------------------------------------------

-   // blockbucket is an array of size 12-by-nblocks, held by row. The
+   // blockbucket is an array of size NBUCKETS-by-nblocks, held by row.  The
   // entry blockbucket [bucket * nblocks + t] holds the # of entries
-   // in the bucket (in range 0 to 11) found by threadblock t.
-
-
-   //__shared__ uint64_t offset [12] ;
-   uint64_t s_0=0;
-   uint64_t s_1=0;
-   uint64_t s_2=0;
-   uint64_t s_3=0;
-   uint64_t s_4=0;
-   uint64_t s_5=0;
-   uint64_t s_6=0;
-   uint64_t s_7=0;
-   uint64_t s_8=0;
-   uint64_t s_9=0;
-   uint64_t s_10=0;
-   uint64_t s_11=0;
+   // in the bucket (in range 0 to NBUCKETS-1) found by threadblock t.
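//------------------------------------------------------------------------------
// A minimal sketch of the reduction used below (assumes a plus monoid on
// int64_t; helper name hypothetical): a grid-stride pass over one bucket's
// row of blockbucket, then a shuffle fold so lane 0 holds the warp's total.
//------------------------------------------------------------------------------
__device__ int64_t row_sum_sketch (const int64_t *row, int nblocks)
{
    int64_t s = 0 ;
    for (int t = threadIdx.x + blockIdx.x * blockDim.x ;
         t < nblocks ;
         t += blockDim.x * gridDim.x)
    {
        s += row [t] ;
    }
    // fold the 32 partial sums of this warp; only lane 0 has the full sum
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
    {
        s += __shfl_down_sync (0xffffffff, s, offset) ;
    }
    return (s) ;
}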
+
+   //__shared__ uint64_t offset [NBUCKETS] ;
+   uint64_t s[NBUCKETS];
+
+   #pragma unroll
+   for(int b = 0; b < NBUCKETS; ++b){
+      s[b] = 0;
+   }

   thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() );

-   //printf("block %d entering sum\n",blockIdx.x);
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;

-   #define reduceBucket( B )                           \
-   for( tid = threadIdx.x + blockIdx.x*blockDim.x;     \
-        tid < nblocks;                                 \
-        tid += blockDim.x*gridDim.x)                   \
-   {                                                   \
-      s_ ## B += blockbucket[ B *nblocks +tid] ;       \
-   }                                                   \
-   __syncthreads();                                    \
-   s_ ## B = warp_ReduceSumPlus( tile, s_ ## B);
-
-   reduceBucket( 0 )
-   reduceBucket( 1 )
-   reduceBucket( 2 )
-   reduceBucket( 3 )
-   reduceBucket( 4 )
-   reduceBucket( 5 )
-   reduceBucket( 6 )
-   reduceBucket( 7 )
-   reduceBucket( 8 )
-   reduceBucket( 9 )
-   reduceBucket( 10 )
-   reduceBucket( 11 )
-
-
-   //printf("summing blk,tid=%d,%d\n",blockIdx.x,threadIdx.x);
-   if (threadIdx.x ==0 )
-   {
-      printf("s_0: %ld, s_1=%ld, s_10=%ld, s_11=%ld\n", s_0, s_1, s_10, s_11);
-      atomicAdd( (unsigned long long int*)&(offset[0]), s_0);
-      atomicAdd( (unsigned long long int*)&(offset[1]), s_1);
-      atomicAdd( (unsigned long long int*)&(offset[2]), s_2);
-      atomicAdd( (unsigned long long int*)&(offset[3]), s_3);
-      atomicAdd( (unsigned long long int*)&(offset[4]), s_4);
-      atomicAdd( (unsigned long long int*)&(offset[5]), s_5);
-      atomicAdd( (unsigned long long int*)&(offset[6]), s_6);
-      atomicAdd( (unsigned long long int*)&(offset[7]), s_7);
-      atomicAdd( (unsigned long long int*)&(offset[8]), s_8);
-      atomicAdd( (unsigned long long int*)&(offset[9]), s_9);
-      atomicAdd( (unsigned long long int*)&(offset[10]),s_10);
-      atomicAdd( (unsigned long long int*)&(offset[11]),s_11);
-   }
-   __syncthreads();
-
-
-
-   if( gridDim.x >= 12)
+   //printf("block %d,dim %d entering sum %d nblocks\n",blockIdx.x, blockDim.x, nblocks);
+   int64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+
+   #pragma unroll
+   for(int b = 0; b < NBUCKETS; ++b) {
+      for( tid = threadIdx.x + blockIdx.x * blockDim.x;
+           tid < nblocks;
+           tid += blockDim.x*gridDim.x) {
+         s[b] += blockbucket[ b * nblocks +tid] ;
+      }
+      this_thread_block().sync();
+
+      s[b] = warp_ReduceSumPlus( tile, s[b]);
+   }
+
+   if (threadIdx.x ==0 )
   {
-      // Cumulative sum across blocks for each bucket
-      if (blockIdx.x <12)
-         blockBucketExclusiveSum( blockIdx.x, blockbucket, nblocks ) ;
+      #pragma unroll
+      for(int b = 0; b < NBUCKETS; ++b) {
+         atomicAdd( (unsigned long long int*)&(offset[b]), s[b]);
+      }
+   }
+   this_thread_block().sync();
+
+   if( gridDim.x >= NBUCKETS)
+   {
+      // Cumulative sum across blocks for each bucket
+      if (blockIdx.x < NBUCKETS)
+         blockBucketExclusiveSum( blockIdx.x, blockbucket, nblocks ) ;
+   }
}

diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2end.cuh b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2end.cuh
--- a/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2end.cuh
+++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_phase2end.cuh
-#include
+#include "GB_cuda_kernel.h"
+//#include
+//#include

-using namespace cooperative_groups;

__global__ void AxB_phase2end
-   (
-   // input, not modified:
-   int64_t *__restrict__ nanobuckets,        // array of size 12-blockDim.x-by-nblocks
-   const int64_t *__restrict__ blockbucket,  // global bucket count, of size 12*nblocks
-   // output:
-   const int64_t *__restrict__ bucketp,      // global bucket cumsum, of size 13
-   int64_t *__restrict__ bucket,             // global buckets, of size cnz (== mnz)
-   const int64_t *__restrict__ offset,       // global offsets, for each bucket
-   // inputs, not modified:
-   const GrB_Matrix C,                       // output matrix
-   const int64_t cnz                         // number of entries in C and M
-   )
+   (
+   // input, not modified:
+   const int64_t *__restrict__ nanobuckets,  // array of size NBUCKETS-blockDim.x-by-nblocks
+   const int64_t *__restrict__ blockbucket,  // global bucket count, of size NBUCKETS*nblocks
+   // output:
+   const int64_t
*__restrict__ bucketp, // global bucket cumsum, of size NBUCKETS+1 + int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) + const int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + const GrB_Matrix C, // output matrix + const int64_t cnz // number of entries in C and M + ) { //-------------------------------------------------------------------------- - // get C and M + // get C information //-------------------------------------------------------------------------- // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a @@ -41,47 +41,39 @@ void AxB_phase2end // need k, just the bucket for each entry C(i,j). int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment - int64_t *__restrict__ Mp = C->p ; // for offset calculations - int64_t mnvec = C->nvec; + //int64_t *Mp = C->p ; // for offset calculations + //int64_t mnvec = C->nvec; //-------------------------------------------------------------------------- // load and shift the nanobuckets for this thread block //-------------------------------------------------------------------------- // The taskbucket for this threadblock is an array of size - // 12-by-blockDim.x, held by row. It forms a 2D array within the 3D + // NBUCKETS-by-blockDim.x, held by row. It forms a 2D array within the 3D // nanobuckets array. - int64_t *__restrict__ taskbucket = nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) ; + const int64_t *taskbucket = nanobuckets + blockIdx.x * (NBUCKETS * blockDim.x) ; //printf("block%d thd%d blockbucket= %ld\n", blockIdx.x, threadIdx.x, // blockbucket[blockIdx.x*gridDim.x+blockIdx.x]); // Each thread in this threadblock owns one column of this taskbucket, for - // its set of 12 nanobuckets. The nanobuckets are a column of length 12, + // its set of NBUCKETS nanobuckets. The nanobuckets are a column of length NBUCKETS, // with stride equal to blockDim.x. - int64_t *__restrict__ nanobucket = taskbucket + threadIdx.x; - - // Each thread loads its 12 nanobucket values into registers. -#define LOAD_NANOBUCKET(bucket) \ - int64_t my_bucket_ ## bucket = \ - nanobucket [bucket * blockDim.x] \ - + blockbucket [bucket * gridDim.x + blockIdx.x]\ - + bucketp [bucket] ; - - LOAD_NANOBUCKET (0) ; - LOAD_NANOBUCKET (1) ; - LOAD_NANOBUCKET (2) ; - LOAD_NANOBUCKET (3) ; - LOAD_NANOBUCKET (4) ; - LOAD_NANOBUCKET (5) ; - LOAD_NANOBUCKET (6) ; - LOAD_NANOBUCKET (7) ; - LOAD_NANOBUCKET (8) ; - LOAD_NANOBUCKET (9) ; - LOAD_NANOBUCKET (10) ; - LOAD_NANOBUCKET (11) ; - - // Now each thread has an index into the global set of 12 buckets, + const int64_t *nanobucket = taskbucket + threadIdx.x; + + // Each thread loads its NBUCKETS nanobucket values into registers. + int64_t my_bucket[NBUCKETS]; + + #pragma unroll + for(int b = 0; b < NBUCKETS; ++b) { + my_bucket[b] = nanobucket [b * blockDim.x] + + blockbucket [b * gridDim.x + blockIdx.x] + + bucketp [b] ; + + //if(b==3) printf("blk:%d tid: %d my_buck[%d]=%lu \n", blockIdx.x, threadIdx.x, b, my_bucket[b]); + } + + // Now each thread has an index into the global set of NBUCKETS buckets, // held in bucket, of where to place its own entries. //-------------------------------------------------------------------------- @@ -92,66 +84,56 @@ void AxB_phase2end // C, which is the part of C operated on by this threadblock. 
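//------------------------------------------------------------------------------
// A compact sketch of the scatter the loops below perform, assuming
// my_bucket [b] already holds this thread's next free slot in the global
// bucket array for bucket b (as assembled from nanobucket + blockbucket +
// bucketp above); helper name hypothetical.
//------------------------------------------------------------------------------
__device__ static inline void scatter_entry_sketch
(
    int64_t p,              // entry of C to place
    int64_t *Ci,            // Ci [p] = (k << 4) + bucket, or a shifted zombie
    int64_t *bucket,        // global bucket array, size cnz
    int64_t *my_bucket      // per-thread cursors, size NBUCKETS
)
{
    int b = (int) (Ci [p] & 0xF) ;      // low 4 bits hold the bucket code
    bucket [my_bucket [b]++] = p ;      // place p; advance this thread's cursor
    if (b == 0) Ci [p] >>= 4 ;          // bucket 0: unshift the zombie index
}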
int64_t pfirst, plast ; - /* - for ( int tid_global = threadIdx.x + blockIdx.x * blockDim.x ; - tid_global < (mnvec+7)/8 ; - tid_global += blockDim.x * gridDim.x) - */ + __shared__ int64_t bucket_idx[chunksize]; + //__shared__ int64_t bucket_s[NBUCKETS][chunksize]; + int chunk_max= (cnz + chunksize -1)/chunksize; for ( int chunk = blockIdx.x; chunk < chunk_max; chunk += gridDim.x ) { - //GB_PARTITION (pfirst, plast, cnz, tid_global, (mnvec+7)/8 ) ; pfirst = chunksize * chunk ; plast = GB_IMIN( chunksize * (chunk+1), cnz ) ; - int chunk_end; - if ( cnz > chunksize) chunk_end = GB_IMIN( chunksize, - cnz - chunksize*(chunk) ); - else chunk_end = cnz; - - // find the first vector of the slice for task blockIdx.x: the - // vector that owns the entry Ai [pfirst] and Ax [pfirst]. - //kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec) ; - - // find the last vector of the slice for task blockIdx.x: the - // vector that owns the entry Ai [plast-1] and Ax [plast-1]. - //klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec) ; - - - for ( int p = pfirst + threadIdx.x; - p < pfirst + chunk_end; + for ( int64_t p = pfirst + threadIdx.x; + p < plast ; p += blockDim.x ) { // get the entry C(i,j), and extract its bucket. Then // place the entry C(i,j) in the global bucket it belongs to. + int tid = p - pfirst; // TODO: these writes to global are not coalesced. Instead: each - // threadblock could buffer its writes to 12 buffers and when the + // threadblock could buffer its writes to NBUCKETS buffers and when the // buffers are full they can be written to global. int ibucket = Ci[p] & 0xF; //printf(" thd: %d p,Ci[p] = %ld,%ld,%d\n", threadIdx.x, p, Ci[p], irow ); - switch (ibucket) - { - case 0: bucket [my_bucket_0++ ] = p ; Ci[p] = Ci[p] >>4; break ; //unshift zombies - case 1: bucket [my_bucket_1++ ] = p ; break ; - case 2: bucket [my_bucket_2++ ] = p ; break ; - case 3: bucket [my_bucket_3++ ] = p ; break ; - case 4: bucket [my_bucket_4++ ] = p ; break ; - case 5: bucket [my_bucket_5++ ] = p ; break ; - case 6: bucket [my_bucket_6++ ] = p ; break ; - case 7: bucket [my_bucket_7++ ] = p ; break ; - case 8: bucket [my_bucket_8++ ] = p ; break ; - case 9: bucket [my_bucket_9++ ] = p ; break ; - case 10: bucket [my_bucket_10++] = p ; break ; - case 11: bucket [my_bucket_11++] = p ; break ; - default: break; - } + //bucket[my_bucket[ibucket]++] = p; + //int idx = (my_bucket[ibucket] - pfirst); + //my_bucket[ibucket] += 1; //blockDim.x; + //int idx = (my_bucket[ibucket]++ - pfirst) & 0x7F; + //bucket_s[ibucket][ idx ] = p; + bucket_idx[tid] = my_bucket[ibucket]++; + Ci[p] = (ibucket==0) * (Ci[p] >> 4) + (ibucket > 0)* Ci[p]; + //if(ibucket == 0) { + //// bucket[my_bucket[0]++] = p; + // Ci[p] = Ci[p] >> 4; + //} else { + // bucket[my_bucket[ibucket]++] = p; + //} } - //__syncthreads(); + for ( int64_t p = pfirst + threadIdx.x; p < plast ; p+= blockDim.x ){ + int tid = p - pfirst; + //int ibucket = Ci[p] & 0xF; + //bucket[ p ] = bucket_s[ibucket][tid]; + bucket [ bucket_idx[tid] ] = p; + //printf("ibucket = %d tid=%d p=%lu idx = %lu val = %lu \n",ibucket, threadIdx.x,p, tid, bucket_s[ibucket][tid]); + //printf("ibucket = %d tid=%d p=%lu idx = %lu \n",ibucket, threadIdx.x, p, bucket_idx[tid]); + + } + } } diff --git a/GraphBLAS/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh b/GraphBLAS/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh index 2264e9f13..6b6c4857b 100644 --- a/GraphBLAS/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh +++ b/GraphBLAS/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh @@ 
-18,34 +18,43 @@ #define GB_CUDA_KERNEL #include #include -#include "matrix.h" +#include "GB_cuda_kernel.h" #include "GB_cuda_atomics.cuh" #include #include -// TODO: Temporary -#define GB_IDENTITY 0 -#define GB_ADD(a, b) a + b - using namespace cooperative_groups; -template< typename T, int tile_sz> +template< typename T, int tile_sz, int rcode> __inline__ __device__ T warp_ReduceSum( thread_block_tile g, T val) { // Each iteration halves the number of active threads // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) { + /* + #pragma unroll + for (int i = tile_sz >> 1; i > 0; i >>= 1) { T fold = g.shfl_down( val, i); - //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); val = GB_ADD( val, fold ); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); } + */ + T fold = g.shfl_down( val, 16); + val = GB_ADD( val, fold ); + fold = g.shfl_down( val, 8); + val = GB_ADD( val, fold ); + fold = g.shfl_down( val, 4); + val = GB_ADD( val, fold ); + fold = g.shfl_down( val, 2); + val = GB_ADD( val, fold ); + fold = g.shfl_down( val, 1); + val = GB_ADD( val, fold ); //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); return val; // note: only thread 0 will return full sum } -template +template __inline__ __device__ T block_ReduceSum(thread_block g, T val) { @@ -54,8 +63,13 @@ T block_ReduceSum(thread_block g, T val) int wid = threadIdx.x >> 5 ; // / warpSize; thread_block_tile tile = tiled_partition( g ); - // Each warp performs partial reduction - val = warp_ReduceSum( tile, val); + // TODO: Figure out how to use graphblas-specific INFINITY macro + #ifndef INFINITY + #define INFINITY std::numeric_limits::max() + #endif + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); // Wait for all partial reductions if (lane==0) { @@ -63,35 +77,41 @@ T block_ReduceSum(thread_block g, T val) shared[wid] = val; // Write reduced value to shared memory //printf("thd%d stored warp%d sum %d\n", threadIdx.x, wid, val); } - __syncthreads(); // Wait for all partial reductions + this_thread_block().sync(); // Wait for all partial reductions - if (wid > 0 ) return val; + //if (wid > 0 ) return val; //read from shared memory only if that warp existed - else { + //else { val = (threadIdx.x < (blockDim.x / warpSize) ) ? 
shared[lane] : GB_IDENTITY ; //if (lane < (blockDim.x/ warpSize) ) printf("thd%d warp%d loaded val = %d\n", threadIdx.x, lane, val); - val = warp_ReduceSum( tile, val); //Final reduce within first warp - } + val = warp_ReduceSum( tile, val); //Final reduce within first warp + //} return val; } -template< typename T, typename Accum, bool atomic_reduce = true> +template< typename T, typename Accum, int rcode, bool atomic_reduce = true> __global__ void reduceNonZombiesWarp ( GrB_Matrix A, - GrB_Scalar O, // array of size grid.x if atomic_reduce==false and size 1 if atomic_reduce==true - int64_t N, // number of edges for sparse, size of x array for full/bitmap + GrB_Scalar R, // array of size grid.x if atomic_reduce==false and size 1 if atomic_reduce==true + int64_t N, // number of edges for sparse, size of x array for full/bitmap bool is_sparse ) { + + // TODO: Figure out how to use graphblas-specific INFINITY macro + #ifndef INFINITY + #define INFINITY std::numeric_limits::max() + #endif + // set thread ID int tid = threadIdx.x ; - int64_t *index = A->i; - T *g_idata = (T*) A->x; - Accum *g_odata = (Accum*) O->x; + const int64_t *__restrict__ index = A->i; + const T *__restrict__ g_idata = (T*) A->x; + Accum *g_odata = (Accum*) R->x; // each thread tid reduces its result into sum Accum sum = (Accum) GB_IDENTITY; @@ -99,21 +119,25 @@ __global__ void reduceNonZombiesWarp for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + if (is_sparse && index[i] < 0) continue; // skip zombies + //T fold = index[i] < 0 ? GB_IDENTITY : g_idata[i]; T fold = g_idata[i]; sum = GB_ADD( sum, fold ); } - __syncthreads(); + this_thread_block().sync(); + //-------------------------------------------------------------------------- // reduce work [0..s-1] to a single scalar //-------------------------------------------------------------------------- // this assumes blockDim is a multiple of 32 - sum = block_ReduceSum< T, 32 >( this_thread_block(), sum) ; + sum = block_ReduceSum< T, 32, rcode >( this_thread_block(), sum) ; + this_thread_block().sync(); // write result for this block to global mem if (tid == 0) { - // TODO: Assuming sum for now (liek the rest of the kernel) + // TODO: Assuming sum for now (like the rest of the kernel) if(atomic_reduce) { atomic_add(g_odata, sum); } else { diff --git a/GraphBLAS/CUDA/templates/GB_nnz.h b/GraphBLAS/CUDA/templates/GB_nnz.h new file mode 120000 index 000000000..6d99cfd21 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_nnz.h @@ -0,0 +1 @@ +../../Source/GB_nnz.h \ No newline at end of file diff --git a/GraphBLAS/CUDA/templates/GB_nnz_full_template.c b/GraphBLAS/CUDA/templates/GB_nnz_full_template.c new file mode 120000 index 000000000..3bfe991a8 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_nnz_full_template.c @@ -0,0 +1 @@ +../../Source/Template/GB_nnz_full_template.c \ No newline at end of file diff --git a/GraphBLAS/CUDA/templates/GB_nnz_held_template.c b/GraphBLAS/CUDA/templates/GB_nnz_held_template.c new file mode 120000 index 000000000..d6e70af28 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_nnz_held_template.c @@ -0,0 +1 @@ +../../Source/Template/GB_nnz_held_template.c \ No newline at end of file diff --git a/GraphBLAS/CUDA/templates/GB_nnz_max_template.c b/GraphBLAS/CUDA/templates/GB_nnz_max_template.c new file mode 120000 index 000000000..0a56cbde7 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_nnz_max_template.c @@ -0,0 +1 @@ +../../Source/Template/GB_nnz_max_template.c \ No newline at end of file diff --git 
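The reduction header above replaces the generic halving loop with five explicit shfl_down folds, one per power of two below the 32-lane warp. A self-contained sketch of that pattern, using plain addition where the kernel substitutes GB_ADD:

    #include <cooperative_groups.h>
    using namespace cooperative_groups;

    // Unrolled warp reduction over a 32-wide tile: each fold halves the
    // number of lanes holding live partial sums; lane 0 ends with the total.
    template <typename T>
    __inline__ __device__ T warp_sum (thread_block_tile<32> g, T val)
    {
        val += g.shfl_down (val, 16) ;
        val += g.shfl_down (val,  8) ;
        val += g.shfl_down (val,  4) ;
        val += g.shfl_down (val,  2) ;
        val += g.shfl_down (val,  1) ;
        return (val) ;      // only lane 0 holds the full sum
    }

The unroll trades the loop-carried dependence on g.size() for straight-line code the compiler can schedule; the #pragma unroll form kept in the commented block above would achieve the same once tile_sz is a compile-time constant.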
a/GraphBLAS/CUDA/templates/GB_nnz_template.c b/GraphBLAS/CUDA/templates/GB_nnz_template.c new file mode 120000 index 000000000..b61f1d201 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_nnz_template.c @@ -0,0 +1 @@ +../../Source/Template/GB_nnz_template.c \ No newline at end of file diff --git a/GraphBLAS/CUDA/templates/denseDotProduct.cu b/GraphBLAS/CUDA/templates/denseDotProduct.cu deleted file mode 100644 index cf3634329..000000000 --- a/GraphBLAS/CUDA/templates/denseDotProduct.cu +++ /dev/null @@ -1,201 +0,0 @@ -//------------------------------------------------------------------------------ -// denseDotProduct.cu -//------------------------------------------------------------------------------ - -// The denseDotProduct CUDA kernel produces the semi-ring dot product of two -// vectors of types T1 and T2 and common size n, to a vector odata of type T3. -// ie. we want to produce dot(x,y) in the sense of the given semi-ring. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. -// Each threadblock owns s*8 contiguous items in the input data. - -// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job -// is to reduce this data to a scalar, and write it to g_odata [b]. - -#include -#include - -using namespace cooperative_groups; - -template< typename T3, int tile_sz> -__inline__ __device__ -T3 warp_ReduceSum(thread_block_tile g, T3 val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - T3 fold = g.shfl_down( val, i); - val = ADD( val, fold ); - } - return val; // note: only thread 0 will return full sum -} - -template -__inline__ __device__ -T3 block_ReduceSum(thread_block g, T3 val) -{ - static __shared__ T3 shared[warpSize]; // Shared mem for 32 partial sums - int lane = threadIdx.x % warpSize; - int wid = threadIdx.x / warpSize; - thread_block_tile tile = tiled_partition(g); - - // Each warp performs partial reduction - val = warp_ReduceSum(tile, val); - - if (lane==0) shared[wid]=val; // Write reduced value to shared memory - - __syncthreads(); // Wait for all partial reductions - - //read from shared memory only if that warp existed - val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : (T3)MONOID_IDENTITY3; - - - if (wid==0) val = warp_ReduceSum(tile,val); //Final reduce within first warp - - return val; -} - -template< typename T1, typename T2, typename T3> -__global__ void denseDotProduct -( - T1 *g_xdata, // array of size n, type T1 - T2 *g_ydata, // array of size n, type T2 - T3 *g_odata, // array of size grid.x, type T3 - unsigned int n -) -{ - // set thread ID - unsigned int tid = threadIdx.x ; - - // this threadblock b owns g_idata [block_start ... block_end-1] - unsigned long int s = blockDim.x ; - unsigned long int b = blockIdx.x ; - unsigned long int block_start = b * s * 8 ; - unsigned long int block_end = (b + 1) * s * 8 ; - - /* - if (tid == 0) - { - printf ("block %d: [%lu ... 
%ld]\n", b, block_start, block_end-1) ; - } - */ - - /* - if (tid == 0 && b == 0) - { - printf ("type is size %d\n", sizeof (T)) ; - for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ; - printf ("\n") ; - } - */ - - // each thread tid reduces its result into sum - T3 sum; - - // nothing to do - if (block_start > block_end) { return ; } - - // convert global data pointer to the local pointer of this block - T1 *xdata = g_xdata + block_start ; - T2 *ydata = g_ydata + block_start ; - - T1 x0, x1, x2, x3, x4, x5, x6, x7 ; - T2 y0, y1, y2, y3, y4, y5, y6, y7 ; - - if (block_end <= n) - { - // unrolling 8 - x0 = xdata [tid] ; - x1 = xdata [tid + s] ; - x2 = xdata [tid + 2 * s] ; - x3 = xdata [tid + 3 * s] ; - x4 = xdata [tid + 4 * s] ; - x5 = xdata [tid + 5 * s] ; - x6 = xdata [tid + 6 * s] ; - x7 = xdata [tid + 7 * s] ; - - y0 = ydata [tid] ; - y1 = ydata [tid + s] ; - y2 = ydata [tid + 2 * s] ; - y3 = ydata [tid + 3 * s] ; - y4 = ydata [tid + 4 * s] ; - y5 = ydata [tid + 5 * s] ; - y6 = ydata [tid + 6 * s] ; - y7 = ydata [tid + 7 * s] ; - /* - if (b == 0) - { - printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid, - (double) x0, (double) x1, (double) x2, (double) x3, - (double) x4, (double) x5, (double) x6, (double) x7) ; - } - */ - - } - else - { - // the last block has size less than 8*s - #define XDATA(i) ((i < lastblocksize) ? xdata [i] : MONOID_IDENTITY1) - #define YDATA(i) ((i < lastblocksize) ? ydata [i] : MONOID_IDENTITY2) - int lastblocksize = n - block_start ; - x0 = XDATA (tid) ; - x1 = XDATA (tid + s) ; - x2 = XDATA (tid + 2 * s) ; - x3 = XDATA (tid + 3 * s) ; - x4 = XDATA (tid + 4 * s) ; - x5 = XDATA (tid + 5 * s) ; - x6 = XDATA (tid + 6 * s) ; - x7 = XDATA (tid + 7 * s) ; - - y0 = YDATA (tid) ; - y1 = YDATA (tid + s) ; - y2 = YDATA (tid + 2 * s) ; - y3 = YDATA (tid + 3 * s) ; - y4 = YDATA (tid + 4 * s) ; - y5 = YDATA (tid + 5 * s) ; - y6 = YDATA (tid + 6 * s) ; - y7 = YDATA (tid + 7 * s) ; - } - - //work [tid] = mul(x0,y0) + mul(x1,y1) + mul(x2,y2) + mul(x3,y3) - // + mul(x4,y4) + mul(x5,y5) + mul(x6,y6)+ mul(x7,y7) ; - sum = ADD( MUL(x0,y0) , ADD( MUL(x1,y1) , ADD( MUL(x2,y2), - ADD( MUL(x3,y3) , ADD( MUL(x4,y4) , ADD( MUL(x5,y5), - ADD( MUL(x6,y6) , MUL(x7,y7)))))))) ; - - /* - if (b == 0) - { - printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid, - (double) x0, (double) x1, (double) x2, (double) x3, - (double) x4, (double) x5, (double) x6, (double) x7) ; - } - - if (b == 0) - { - printf ("block zero: here is tid %d result %g is %g\n", - tid, sum, - (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ; - } - */ - - __syncthreads ( ) ; - - //-------------------------------------------------------------------------- - // reduce per-thread sums to a single scalar - //-------------------------------------------------------------------------- - - sum = block_ReduceSum( this_thread_block(), sum); - - // write result for this block to global mem - if (tid == 0) - { - printf ("final %d : %g\n", b, (T3) sum) ; - g_odata [b] = sum ; - } -} - diff --git a/GraphBLAS/CUDA/templates/sparseDotProduct.cu b/GraphBLAS/CUDA/templates/sparseDotProduct.cu deleted file mode 100644 index 71c15c4e9..000000000 --- a/GraphBLAS/CUDA/templates/sparseDotProduct.cu +++ /dev/null @@ -1,188 +0,0 @@ -//------------------------------------------------------------------------------ -// sparseDotProduct_merge_path.cu -//------------------------------------------------------------------------------ - -// The sparseDotProduct CUDA kernel produces the 
semi-ring dot product of two -// sparse vectors of types T1 and T2 and common index space size n, to a scalar -// odata of type T3. The vectors are sparse, with different numbers of non-zeros. -// ie. we want to produce dot(x,y) in the sense of the given semi-ring. - -// This version uses a merge-path algorithm, when the sizes g_xnz and g_ynz are -// relatively close in size, but for any size of N. - -// Both the grid and block are 1D, so blockDim.x is the # threads in a -// threadblock, and the # of threadblocks is grid.x - -// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number -// of active threads = min( min(g_xnz, g_ynz), 32) - -// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job -// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot -// product on those items in the intersection, and finally reduce this data to a scalar, -// on exit write it to g_odata [b]. - -#include -#include - -using namespace cooperative_groups; - -template< typename T, int tile_sz> -__device__ T reduce_sum(thread_block_tile g, T val) -{ - // Each iteration halves the number of active threads - // Each thread adds its partial sum[i] to sum[lane+i] - for (int i = g.size() / 2; i > 0; i /= 2) - { - val = ADD( val, g.shfl_down(val,i) ); - //if (g.thread_rank() ==0) - // printf("in reduce_sum i=%i val = %f\n", i, val); - } - return val; // note: only thread 0 will return full sum -} - -#define INTMIN( A, B) ( (A) < (B) ) ? (A) : (B) -#define INTMAX( A, B) ( (A) > (B) ) ? (A) : (B) -#define intersects_per_thread 4 - -template< typename T1, typename T2, typename T3> -__global__ void sparseDotProduct -( - unsigned int g_xnz, // Number of non-zeros in x - unsigned int *g_xi, // Non-zero indices in x, size xnz - T1 *g_xdata, // array of size xnz, type T1 - unsigned int g_ynz, // Number of non-zeros in y - unsigned int *g_yi, // Non-zero indices in y, size ynz - T2 *g_ydata, // array of size ynz, type T2 - T3 *g_odata // array of size grid.x, type T3 -) -{ - // set thread ID - unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; - unsigned int tid = threadIdx.x; - - unsigned long int b = blockIdx.x ; - - // total items to be inspected - unsigned int nxy = (g_xnz + g_ynz); - - //largest possible number of intersections is the smaller nz - unsigned int n_intersect = INTMIN( g_xnz, g_ynz); - - //we want more than one intersection per thread - unsigned int parts = (n_intersect+ intersects_per_thread -1)/ intersects_per_thread; - - unsigned int work_per_thread = (nxy +parts -1)/parts; - unsigned int diag = INTMIN( work_per_thread*tid_global, nxy); - unsigned int diag_end = INTMIN( diag + work_per_thread, nxy); - //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); - - unsigned int x_min = INTMAX( (int)(diag - g_ynz), 0); - unsigned int x_max = INTMIN( diag, g_xnz); - - //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); - while ( x_min < x_max) { //binary search for correct diag break - unsigned int pivot = (x_min +x_max)/2; - if ( g_xi[pivot] < g_yi[ diag -pivot -1]) { - x_min = pivot +1; - } - else { - x_max = pivot; - } - } - int xcoord = x_min; - int ycoord = diag -x_min -1; - if (( diag > 0) &&(diag < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { - diag--; //adjust for intersection incrementing both pointers - } - // two start points are known now - int x_start = xcoord; - int y_start = diag -xcoord; - - //if (x_start != y_start) - // 
printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); - - x_min = INTMAX( (int)(diag_end - g_ynz), 0); - x_max = INTMIN( diag_end, g_xnz); - - while ( x_min < x_max) { - unsigned int pivot = (x_min +x_max)/2; - //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); - if ( g_xi[pivot] < g_yi[ diag_end -pivot -1]) { - x_min = pivot +1; - } - else { - x_max = pivot; - } - //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); - } - xcoord = x_min; - ycoord = diag_end -x_min -1; - if ( (diag_end < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { - diag--; //adjust for intersection incrementing both pointers - } - // two end points are known now - int x_end = xcoord; - int y_end = diag_end - xcoord; - - /* - if (tid == 0 && b == 0) { - printf ("type1 is size %d\n", sizeof (T1)) ; - for (int k = 0 ; k < g_xnz ; k++) printf ("%4d: %g,", k, (T1) g_xdata [k]) ; - printf ("\n") ; - printf ("type2 is size %d\n", sizeof (T2)) ; - for (int k = 0 ; k < g_ynz ; k++) printf ("%4d: %g,", k, (T2) g_ydata [k]) ; - printf ("\n") ; - } - __syncthreads(); - */ - - T3 sum = (T3) 0; - //printf(" thd%u has init value %f\n",tid, sum); - - // nothing to do - if ( (x_start >= x_end) || (y_start >= y_end) ) { return ; } - - //merge-path dot product - int k = x_start; - int l = y_start; - while ( k < x_end && l < y_end ) - { - if ( g_xi[k] < g_yi[l] ) k += 1; - else if ( g_xi[k] > g_yi[l] ) l += 1; - else { - //printf(" thd%d ix at %u \n",tid_global,g_xi[k]); - //printf(" sum += %f * %f \n",tid,g_xdata[k],g_ydata[l]); - //sum = ADD( sum, MUL( g_xdata[k], g_ydata[l])); - MULADD( sum, g_xdata[k], g_ydata[l]); - //printf(" thd%u work value = %f\n",tid_global, sum); - k+= 1; - l+= 1; - } - - } - - __syncthreads ( ) ; - /* - if (1) - { - printf ("thd%u done with intersect and multiply, val = %f\n",tid_global, sum) ; - } - __syncthreads ( ) ; - */ - - //-------------------------------------------------------------------------- - // reduce sum per-thread values to a single scalar - //-------------------------------------------------------------------------- - // Using tile size fixed at compile time, we don't need shared memory - #define tile_sz 32 - thread_block_tile tile = tiled_partition( this_thread_block()); - T3 block_sum = reduce_sum(tile, sum); - - // write result for this block to global mem - if (tid == 0) - { - printf ("final %d : %g\n", b, block_sum) ; - g_odata [b] = block_sum ; - } -} - diff --git a/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu.jit b/GraphBLAS/CUDA/templates/stdbool.h similarity index 100% rename from GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu.jit rename to GraphBLAS/CUDA/templates/stdbool.h diff --git a/GraphBLAS/CUDA/templates/stuff.cu b/GraphBLAS/CUDA/templates/stuff.cu deleted file mode 100644 index 9241fd1eb..000000000 --- a/GraphBLAS/CUDA/templates/stuff.cu +++ /dev/null @@ -1,9 +0,0 @@ - val = ADD( val, g.shfl_down( val, i) ); - - - t = g.shfl_down( val, i) ; - val = ADD( val, t ); - - GB_ADD (val, t) ; // statment val = GB_ADD_FUNCTION (val, t) - - diff --git a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp index 7604c75fb..758131f20 100644 --- a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp +++ b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.cpp @@ -2,7 +2,7 @@ // Test AxB_dot3_cuda kernels // Using data generators and test classes, cover -// all 12 cases for the masked GEMM ( C, M, A, B) in GraphBLAS +// all NBUCKETS cases for the masked GEMM ( C, M, 
A, B) in GraphBLAS // Tests Semirings, data types and a range of data input sizes and shapes // Connects to the jitFactory for launches. diff --git a/GraphBLAS/CUDA/test/cuda_tests_template.cpp b/GraphBLAS/CUDA/test/cuda_tests_template.cpp index e6256bb34..d2ccaa29b 100644 --- a/GraphBLAS/CUDA/test/cuda_tests_template.cpp +++ b/GraphBLAS/CUDA/test/cuda_tests_template.cpp @@ -2,7 +2,7 @@ // Test AxB_dot3_cuda kernels // Using data generators and test classes, cover -// all 12 cases for the masked GEMM ( C, M, A, B) in GraphBLAS +// all NBUCKETS cases for the masked GEMM ( C, M, A, B) in GraphBLAS // Tests Semirings, data types and a range of data input sizes and shapes // Connects to the jitFactory for launches. @@ -11,6 +11,7 @@ #include #include #include +#include "problem_spec.hpp" #include "jitTestFactory.hpp" #include "../GB_cuda_buckets.h" diff --git a/GraphBLAS/CUDA/test/dataFactory.hpp b/GraphBLAS/CUDA/test/dataFactory.hpp index 52932423a..691d29fd5 100644 --- a/GraphBLAS/CUDA/test/dataFactory.hpp +++ b/GraphBLAS/CUDA/test/dataFactory.hpp @@ -9,7 +9,6 @@ #include "GB.h" #include "../GB_cuda_type_wrap.hpp" -#include "../GB_Matrix_allocate.h" #include "test_utility.hpp" #include "../GB_cuda_error.h" @@ -64,6 +63,13 @@ class matrix : public Managed { return mat; } + ~matrix() { + if(mat != NULL) { + GrB_Matrix_free(&mat); + mat = NULL; + } + } + uint64_t get_zombie_count() { return mat->nzombies;} void clear() { @@ -74,20 +80,17 @@ class matrix : public Managed { GrB_Type type = cuda::jit::to_grb_type(); GRB_TRY (GrB_Matrix_new (&mat, type, nrows_, ncols_)) ; + // GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, // GxB_SPARSE) ; // or: // GxB_HYPERSPARSE, GxB_BITMAP, GxB_FULL - -// mat = GB_Matrix_allocate( -// type, /// <<<<<< row_lookup; unordered_set key_lookup; for ( int co = 0; co < 2*nrows_; co++ ) @@ -151,7 +156,7 @@ class matrix : public Managed { while ( remain > 0) { - std::cout<< remain<<" nonzeroes left to fill.."< class SpGEMM_problem_generator { - float Anzpercent,Bnzpercent,Cnzpercent; - int64_t Cnz; + float Anzpercent,Bnzpercent,Mnzpercent; + int64_t Mnz; int64_t *Bucket = nullptr; - int64_t BucketStart[13]; + int64_t BucketStart[NBUCKETS+1]; unsigned seed = 13372801; bool ready = false; @@ -251,6 +259,8 @@ class SpGEMM_problem_generator { matrix *A= nullptr; matrix *B= nullptr; + SpGEMM_problem_generator() {}; + SpGEMM_problem_generator(int64_t nrows, int64_t ncols): nrows_(nrows), ncols_(ncols) { // Create sparse matrices @@ -260,6 +270,16 @@ class SpGEMM_problem_generator { B = new matrix(nrows_, ncols_); }; + void initDim ( int64_t nrows, int64_t ncols){ + nrows_ = nrows; + ncols_ = ncols; + // Create sparse matrices + C = new matrix(nrows_, ncols_); + M = new matrix(nrows_, ncols_); + A = new matrix(nrows_, ncols_); + B = new matrix(nrows_, ncols_); + } + matrix* getCptr(){ return C;} matrix* getMptr(){ return M;} matrix* getAptr(){ return A;} @@ -283,31 +303,18 @@ class SpGEMM_problem_generator { int64_t* getBucket() { return Bucket;} int64_t* getBucketStart(){ return BucketStart;} - void loadCj() { - - // Load C_i with column j info to avoid another lookup - for (int c = 0 ; c< M->mat->vdim; ++c) { - for ( int r = M->mat->p[c]; r< M->mat->p[c+1]; ++r){ - C->mat->i[r] = c << 4 ; //shift to store bucket info - } - } - } - - void init_C(float Cnzp, std::int64_t seed_c = 23456ULL, std::int64_t seed_m = 4567ULL){ + void init_C(float Mnzp, std::int64_t seed_c = 23456ULL, std::int64_t seed_m = 4567ULL){ // Get sizes relative to fully dense matrices - Cnzpercent = Cnzp; 
- Cnz = (int64_t)(Cnzp * nrows_ * ncols_); + Mnzpercent = Mnzp; + Mnz = (int64_t)(Mnzp * nrows_ * ncols_); //Seed the generator - std::cout<<"filling matrices"<fill_random(Cnz, GxB_SPARSE, GxB_BY_ROW, seed_m); - M->fill_random(Cnz, GxB_SPARSE, GxB_BY_ROW, seed_m); + C->fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); + M->fill_random(Mnz, GxB_SPARSE, GxB_BY_ROW, seed_m); -// std::cout<<"fill complete"<mat->p = M->mat->p; //same column pointers (assuming CSC here) -// C->mat->p_shallow = true ; // C->mat does not own M->mat->p } void del(){ @@ -315,7 +322,7 @@ class SpGEMM_problem_generator { M->clear(); A->clear(); B->clear(); - if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); + //if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); delete C; delete M; delete A; @@ -323,42 +330,43 @@ class SpGEMM_problem_generator { CHECK_CUDA( cudaDeviceSynchronize() ); } + // void fill_buckets( int fill_bucket){ - std::cout< fill_bucket) BucketStart[b] = Cnz; + if (b > fill_bucket) BucketStart[b] = Mnz; //std::cout<< " one bucket "<< b<<"starts at "< #include //#include "GB_binary_search.h" +#include "GB_cuda_reduce_factory.hpp" #include "GpuTimer.h" #include "GB_cuda_buckets.h" #include "../../rmm_wrap/rmm_wrap.h" #include #include "test_data.hpp" +#include "problem_spec.hpp" extern "C" { #include "GB.h" + #include "GraphBLAS.h" } #include "../jitFactory.hpp" @@ -41,25 +44,12 @@ bool test_AxB_phase1_factory( int64_t , int64_t , int64_t , int64_t ) ; template bool test_AxB_dot3_phase2_factory( int , int64_t , int64_t , int64_t, int64_t ) ; -////AxB_dot3_phase3 kernels -//template -//bool test_AxB_dot3_dndn_factory( int , int64_t , int64_t , int64_t , std::string&) ; -// -//template -//bool test_AxB_dot3_vsvs_factory( int , int64_t , int64_t , int64_t , std::string&) ; -// -//template -//bool test_AxB_dot3_spdn_factory( int , int64_t , int64_t , int64_t , std::string&) ; -// -//template -//bool test_AxB_dot3_vssp_factory( int , int64_t , int64_t , int64_t , std::string&) ; -// -//template -//bool test_AxB_dot3_mp_factory( int , int64_t , int64_t , int64_t , std::string&) ; -// -//template -//bool test_AxB_dot3_warp_factory( int , int64_t , int64_t , int64_t , std::string&) ; - +template +void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, + std::vector &indptr, + std::vector &indices, T *data, + int gxb_sparsity_control = GxB_SPARSE, + int gxb_format = GxB_BY_ROW) ; //Fixture to generate valid inputs and hold them for tests class AxB_dot3_Test : public ::testing::Test @@ -75,7 +65,7 @@ void print_array(void *arr, I size, const char *name) { for(I i = 0; i < size; ++i) { std::cout << static_cast(arr)[i] << ", "; } - std::cout << std::endl << "Done." 
<< std::endl; + std::cout << std::endl; } //------------------------------------------------------------------------------ @@ -85,7 +75,7 @@ void print_array(void *arr, I size, const char *name) { // Test generator code, to allow parameterized tests // Uses jitFactory, dataFactory and GB_jit template -bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_Monoid monoid, GrB_BinaryOp binop) +bool test_AxB_phase1_factory(mxm_problem_spec &problem_spec) { int gpuID; @@ -93,147 +83,133 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M std::cout<< "found device "< G(N, N); - int64_t Annz = N*N; - int64_t Bnnz = N*N; - int64_t Cnz = N; - float Cnzpercent = (float) Cnz/(N*N); - - // TODO: Allocate and fill arrays for buckets and nano buckets - G.init_A(Annz, GxB_SPARSE, GxB_BY_ROW); - G.init_B(Bnnz, GxB_SPARSE, GxB_BY_ROW); - G.init_C(Cnzpercent); - G.fill_buckets( TB ); // all elements go to testbucket= TB - - GrB_Matrix C = G.getC(); - GrB_Matrix M = G.getM(); - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); - - /************************ - * Create semiring factory - */ - - GB_cuda_semiring_factory mysemiringfactory = GB_cuda_semiring_factory ( ) ; - GrB_Semiring mysemiring; - auto grb_info = GrB_Semiring_new(&mysemiring, monoid, binop); - GRB_TRY (grb_info) ; - - mysemiringfactory.semiring_factory ( mysemiring, flipxy, - C->type, - M->type, - A->type, - B->type, - mask_struct, // matrix types - mask_comp, GB_sparsity(C), - GB_sparsity(M), - GB_sparsity(A), - GB_sparsity(B)); + cudaStream_t strm; + CHECK_CUDA(cudaStreamCreate(&strm)); /******************** * Launch kernel */ - + GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); phase1launchFactory p1lF(mysemiringfactory); GpuTimer kernTimer; - kernTimer.Start(); int nthrd = p1lF.get_threads_per_block(); - int ntasks = p1lF.get_number_of_blocks(M); + int ntasks = p1lF.get_number_of_blocks(problem_spec.getM()); // TODO: Verify that RMM is checking and throwing exceptions int nanobuckets_size = NBUCKETS * nthrd * ntasks; int blockbuckets_size = NBUCKETS * ntasks; - printf("nanobuckets_size: %d\n", nanobuckets_size); - printf("blockbuckets_size: %d\n", blockbuckets_size); - int64_t *Nanobuckets = (int64_t*)rmm_wrap_malloc(nanobuckets_size * sizeof (int64_t)); int64_t *Blockbucket = (int64_t*)rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t)); -// -// std::cout << "INvoking grid block launch for phase1" << std::endl; - p1lF.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B); + + kernTimer.Start(); + p1lF.jitGridBlockLaunch(Nanobuckets, Blockbucket, + problem_spec.getC(), problem_spec.getM(), + problem_spec.getA(), problem_spec.getB(), strm); + + CHECK_CUDA(cudaStreamSynchronize(strm)); kernTimer.Stop(); std::cout<<"returned from phase1 kernel "<(Nanobuckets, nanobuckets_size, "Nanobuckets"); - print_array(Blockbucket, blockbuckets_size, "Blockbucket"); +// print_array(Nanobuckets, nanobuckets_size, "Nanobuckets"); +// print_array(Blockbucket, blockbuckets_size, "Blockbucket"); std::cout<<"==== phase1 done=============================" < +bool test_AxB_dense_phase1_factory(mxm_problem_spec &problem_spec) +{ + cudaStream_t strm; + CHECK_CUDA(cudaStreamCreate(&strm)); + + /******************** + * Launch kernel + */ + GB_cuda_mxm_factory mysemiringfactory = problem_spec.get_mxm_factory(); + dense_phase1launchFactory p1lF(mysemiringfactory); + p1lF.jitGridBlockLaunch(problem_spec.getC(), problem_spec.getM(), problem_spec.getA(), problem_spec.getB(), strm); + + 
CHECK_CUDA(cudaStreamSynchronize(strm)); + return true; +} + + //------------------------------------------------------------------------------ // test_AxB_phase2_factory: test phase2 and phase2end //------------------------------------------------------------------------------ -template -bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) +template +bool test_AxB_phase2_factory(mxm_problem_spec &problem_spec) { int gpuID; cudaGetDevice( &gpuID); - std::cout<< "found device "< G(N, N); - int64_t Annz = N*N; - int64_t Bnnz = N*N; - int64_t Cnz = N; - float Cnzpercent = (float) Cnz/(N*N); - - G.init_A(Annz, GxB_SPARSE, GxB_BY_ROW); - G.init_B(Bnnz, GxB_FULL, GxB_BY_ROW); - G.init_C(Cnzpercent); - G.fill_buckets( TB ); // all elements go to testbucket= TB - G.loadCj(); // FIXME: Figure out why this is needed here - - - GrB_Matrix C = G.getC(); - GrB_Matrix M = G.getM(); // note: values are not accessed - GpuTimer kernTimer; kernTimer.Start(); - const int64_t mnz = GB_nnz (M) ; + + const int64_t mnz = GB_nnz (problem_spec.getM()) ; int nthrd = p2lF.get_threads_per_block(); - int ntasks = p2elF.get_number_of_blocks(M); + int ntasks = p2elF.get_number_of_blocks(problem_spec.getM()); // fabricate data as if it came from phase1: int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); - int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); + int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); - std::cout << "nthrd: " << nthrd << ", ntasks: " << ntasks << std::endl; - fillvector_constant(NBUCKETS * nthrd * ntasks, nanobuckets, (int64_t)1); - fillvector_constant(NBUCKETS * ntasks, blockbucket, (int64_t)1); - fillvector_constant(NBUCKETS, bucketp, (int64_t)1); + fillvector_constant(NBUCKETS, bucketp, (int64_t)0); + fillvector_constant(NBUCKETS, offset, (int64_t)0); + //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); + + std::cout << "Running phase1 kernel" << std::endl; + kernTimer.Start(); + p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, + problem_spec.getC(), problem_spec.getM(), + problem_spec.getA(), problem_spec.getB(), strm); + + + CHECK_CUDA(cudaStreamSynchronize(strm)); + kernTimer.Stop(); + + std::cout << " phase1 internal phase2 "<< kernTimer.Elapsed() <<"ms Done." << std::endl; + + // // launch phase2 (just with p2ntasks as the # of tasks) + kernTimer.Start(); + p2lF.jitGridBlockLaunch(blockbucket, offset, problem_spec.getM(),strm); + CHECK_CUDA(cudaStreamSynchronize(strm)); + kernTimer.Stop(); + std::cout << " phase2 kern "<< kernTimer.Elapsed() <<"ms Done." 
<< std::endl; - print_array(nanobuckets, NBUCKETS*nthrd*ntasks, "nanobuckets"); - print_array(blockbucket, NBUCKETS*ntasks, "blockbucket"); -// -// // launch phase2 (just with p2ntasks as the # of tasks) - p2lF.jitGridBlockLaunch(blockbucket, offset, M); // // // do the reduction between phase2 and phase2end int64_t s= 0; @@ -241,32 +217,42 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) { bucketp[bucket] = s; s+= offset[bucket]; - //printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]); } // launch phase2end: note same # of tasks as phase1 + kernTimer.Start(); p2elF.jitGridBlockLaunch( nanobuckets, blockbucket, - bucketp, bucket, offset, C, - M); -// kernTimer.Stop(); -// std::cout<<"returned from phase2 kernel "<(bucketp, NBUCKETS, "bucketp"); - print_array(bucket, mnz, "bucket"); - std::cout<<"phase2 kernel done =================="<(bucket, mnz, "bucket"); + std::cout<<"phase2 done =================="< -void make_grb_matrix(GrB_Matrix &mat, int64_t n_rows, int64_t n_cols, std::vector &indptr, std::vector &indices, std::vector &data, - int gxb_sparsity_control = GxB_SPARSE, int gxb_format = GxB_BY_ROW) { +void make_grb_matrix(GrB_Matrix mat, int64_t n_rows, int64_t n_cols, + std::vector &indptr, + std::vector &indices, T *data, + int gxb_sparsity_control, + int gxb_format ) +{ GrB_Type type = cuda::jit::to_grb_type(); @@ -287,207 +273,153 @@ void make_grb_matrix(GrB_Matrix &mat, int64_t n_rows, int64_t n_cols, std::vecto GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; GRB_TRY (GB_convert_any_to_non_iso (mat, true, NULL)) ; - // TODO: Need to specify these GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); - GRB_TRY (GxB_Matrix_fprint (mat, "my mat", GxB_SHORT_VERBOSE, stdout)) ; - bool iso ; - GRB_TRY (GxB_Matrix_iso (&iso, mat)) ; - if (iso) - { - printf ("Die! 
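The handoff between phase2 and phase2end above is a serial exclusive scan: offset[b] holds the number of entries that landed in bucket b, and bucketp[b] must become the start of bucket b in the concatenated bucket array. A sketch of that step, assuming bucketp has NBUCKETS+1 slots so the grand total lands in the final one:

    #include <cstdint>

    // Exclusive prefix sum over per-bucket counts; NBUCKETS is small, so a
    // serial host-side loop is all the tests need.
    static void bucket_offsets (const int64_t *offset, int64_t *bucketp,
                                int nbuckets)
    {
        int64_t s = 0 ;
        for (int b = 0 ; b < nbuckets ; b++)
        {
            bucketp [b] = s ;       // start of bucket b
            s += offset [b] ;
        }
        bucketp [nbuckets] = s ;    // total entries across all buckets
    }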
(cannot do iso)\n") ; - GrB_Matrix_free (&mat) ; - } } -template -bool test_AxB_dot3_full_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, - GrB_Monoid monoid, GrB_BinaryOp binop) { +template < + typename T_C, typename T_M, typename T_A,typename T_B, + typename T_X, typename T_Y, typename T_Z> +bool test_AxB_dot3_sparse_factory(mxm_problem_spec &problem_spec) { // FIXME: Allow the adaptive tests in this guy + std::cout << "sparse test ======================" << std::endl; - //Generate test data and setup for using a jitify kernel with 'bucket' interface - // The testBucket arg tells the generator which bucket we want to exercise - int64_t Annz; - int64_t Bnnz; - - switch(TB) { - case GB_BUCKET_DNDN: - Annz = N * N; - Bnnz = N * N; - break; - case GB_BUCKET_SPDN: - Annz = N * N; - Bnnz = N * 5; - break; - case GB_BUCKET_VSSP: - Annz = N * 2; - Bnnz = N * 10; - break; - case GB_BUCKET_VSVS_4: - case GB_BUCKET_VSVS_16: - case GB_BUCKET_VSVS_64: - case GB_BUCKET_VSVS_256: - Annz = N * 2; - Bnnz = N * 4; - break; - case GB_BUCKET_MERGEPATH: - Annz = N * 5; - Bnnz = N * 2; - break; - default: - printf("Bucket not yet being tested!\n"); - exit(1); - } - int64_t Cnz = N; - float Cnzpercent = (float) Cnz/(N*N); - - // FIXME: make this an argument - bool Mask_struct = true; + GpuTimer kernTimer; - std::cout << "Getting test data" << std::endl; - // FIXME: These need to be set based on the bucket being tested -// TestData data = *make_karate_tricount(); + cudaStream_t strm; + CHECK_CUDA(cudaStreamCreate(&strm)); - std::cout << "Creating problem gen" << std::endl; -// N = data.A_indptr.size()-1; - SpGEMM_problem_generator G(N, N); - G.init_C(float(Cnz) / (N * N)); + std::cout << "sr_code: " << problem_spec.get_mxm_factory().sr_code << std::endl; -// GrB_Matrix A; -// GrB_Matrix B; -// GrB_Matrix C; -// GrB_Matrix M; -// -// GrB_Matrix C_actual = G.getC(); + bool result = false; -// make_grb_matrix(A, data.A_indptr, data.A_indices, data.A_data, GxB_SPARSE); -// make_grb_matrix(B, data.B_indptr, data.B_indices, data.B_data, GxB_FULL, GxB_BY_ROW); -// make_grb_matrix(C, data.C_indptr, data.C_indices, data.C_data); -// make_grb_matrix(M, data.M_indptr, data.M_indices, data.M_data); + int64_t N = problem_spec.getN(); + /** + * Run Phase 1, phase 2 and phase2end: Compute nanobuckets and blockbuckets + */ + auto mymxm = problem_spec.get_mxm_factory(); + phase1launchFactory p1lF(mymxm); + phase2launchFactory p2lF; + phase2endlaunchFactory p2elF; -// std::cout << "Filling A" << std::endl; - G.init_A(Annz, GxB_SPARSE, GxB_BY_ROW, 543210, 0, 2); -// std::cout << "Filling B" << std::endl; + GrB_Matrix C = problem_spec.getC(); + GrB_Matrix M = problem_spec.getM(); + GrB_Matrix A = problem_spec.getA(); + GrB_Matrix B = problem_spec.getB(); - G.init_B(Bnnz, GxB_SPARSE, GxB_BY_ROW, 32, 0, 2); + const int64_t mnz = GB_nnz (M) ; + const int64_t cnz = GB_nnz (C) ; + const int64_t cvlen = C->vlen ; + const int64_t cvdim = C->vdim ; + const int64_t cnvec = C->nvec ; - /** - * For testing, we need to create our output C and configure - * it w/ the necessary sparsity. 
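Further down, the rewritten sparse factory walks the buckets and launches one specialized phase3 kernel per non-empty bucket (see the loop in the next hunk). A condensed sketch of that dispatch, reusing the names from the hunk; it assumes, as the hunk suggests, that bucket 0 holds the pre-zombies that are added to C->nzombies rather than launched:

    // One phase3 launch per non-empty bucket; b_start/b_end index into the
    // concatenated bucket array through the bucketp offsets built above.
    for (int b = 1 ; b < NBUCKETS ; b++)
    {
        int64_t b_start = bucketp [b] ;
        int64_t b_end   = bucketp [b+1] ;
        if (b_end <= b_start) continue ;            // bucket b is empty
        phase3launchFactory p3lf (mymxm, (GB_bucket_code) b) ;
        p3lf.jitGridBlockLaunch (b_start, b_end, bucketp, bucket,
                                 C, M, A, B, strm) ;
    }
    CHECK_CUDA (cudaStreamSynchronize (strm)) ;     // wait for every bucket kernel
    C->nzombies += bucketp [1] ;                    // bucket 0: pre-zombies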
- */ - G.fill_buckets( TB); // all elements go to testbucket= TB + bool C_iso = false ; + int C_sparsity = GB_sparsity (M) ; + int M_sparsity = GB_sparsity (M) ; + GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; - GrB_Matrix C = G.getC(); - GrB_Matrix M = G.getM(); - GrB_Matrix A = G.getA(); - GrB_Matrix B = G.getB(); + int nthrd = p2lF.get_threads_per_block(); + int ntasks = p2elF.get_number_of_blocks(M); -// GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; -// GRB_TRY (GxB_Matrix_fprint (B, "B", GxB_SHORT_VERBOSE, stdout)) ; -// GRB_TRY (GxB_Matrix_fprint (M, "M", GxB_SHORT_VERBOSE, stdout)) ; -// GRB_TRY (GxB_Matrix_fprint (C, "C", GxB_SHORT_VERBOSE, stdout)) ; -// - std::cout << "Building semiring factgory" << std::endl; - GB_cuda_semiring_factory mysemiringfactory = GB_cuda_semiring_factory ( ) ; - GrB_Semiring mysemiring; - auto grb_info = GrB_Semiring_new(&mysemiring, monoid, binop); - GRB_TRY (grb_info) ; - - bool flipxy = false; - bool mask_struct = false; - bool mask_comp = false; -// GrB_Matrix C_actual = C; - - mysemiringfactory.semiring_factory ( mysemiring, flipxy, - C->type, M->type, - A->type, B->type, - mask_struct, // matrix types - mask_comp, GB_sparsity(C), - GB_sparsity(M), - GB_sparsity(A), - GB_sparsity(B) ) ; + // fabricate data as if it came from phase1: + int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t)); + int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t)); + int64_t *bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t)); + int64_t *bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t)); + int64_t *offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t)); - bool result = false; + fillvector_constant(NBUCKETS, bucketp, (int64_t)0); + fillvector_constant(NBUCKETS, offset, (int64_t)0); + //fillvector_constant(problem_spec.getCnnz(), bucket, (int64_t)0); - /** - * Run Phase 1: Compute nanobuckets and blockbuckets - */ - const int64_t mnz = GB_nnz (M) ; + std::cout << "sparse phase1 kernel" << std::endl; + kernTimer.Start(); + p1lF.jitGridBlockLaunch(nanobuckets, blockbucket, + C, M, A, B, strm); + CHECK_CUDA(cudaStreamSynchronize(strm)); + kernTimer.Stop(); + std::cout<<"sparse test phase1 kernel "< 0) std::cout<< "bucket "<(bucketp, NBUCKETS+1, "bucketp"); + phase3launchFactory p3lf(mymxm, (GB_bucket_code)b); + p3lf.jitGridBlockLaunch( b_start, b_end, bucketp, bucket, C, M, + A, B, strm); + CHECK_CUDA(cudaStreamSynchronize(strm)); kernTimer.Stop(); + std::cout << "phase3 bucket="<nzombies += (bucketp[1]); //add pre-zombies to the count; - std::cout<<"returned from kernel "<(); - GRB_TRY (GrB_Matrix_new (&C_actual, type, N, N)) ; + GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; // ensure the GPU is not used GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_NEVER)) ; // Use GrB_DESC_S for structural because dot3 mask will never be complemented - GRB_TRY (GrB_mxm(C_actual, M, NULL, mysemiring, A, B, - Mask_struct ? GrB_DESC_ST1 : GrB_DESC_T1)); -// GRB_TRY (GrB_mxm(C_actual, M, NULL, mysemiring, A, B, -// Mask_struct ? GrB_DESC_S : NULL)); + // The order of B and A is swapped to account for CSR vs CSC assumption + GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), + problem_spec.getA(), problem_spec.get_mask_struct() ? 
GrB_DESC_ST1 : GrB_DESC_T1)); - GRB_TRY (GxB_Matrix_fprint (M, "M actual", GxB_SHORT_VERBOSE, stdout)); - GRB_TRY (GxB_Matrix_fprint (A, "A actual", GxB_SHORT_VERBOSE, stdout)); - GRB_TRY (GxB_Matrix_fprint (B, "B actual", GxB_SHORT_VERBOSE, stdout)); - GRB_TRY(GrB_Matrix_wait(C, GrB_MATERIALIZE)); - GRB_TRY(GrB_Matrix_wait(C_actual, GrB_MATERIALIZE)); + GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); - GRB_TRY (GxB_Matrix_fprint (C, "C GPU", GxB_COMPLETE, stdout)); - GRB_TRY (GxB_Matrix_fprint (C_actual, "C_actual", GxB_COMPLETE, stdout)); // compare double tol = 0 ; GrB_Index nvals1 = 0, nvals2 = 0 ; GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; - GRB_TRY (GrB_Matrix_nvals (&nvals2, C_actual)) ; - if (nvals1 != nvals2) { printf ("!!\n") ; abort ( ) ; } + GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; + if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!!\n") ; ADD_FAILURE( ) ; } GrB_Index nrows, ncols ; - GrB_Matrix_nrows (&nrows, C) ; - GrB_Matrix_ncols (&ncols, C) ; + GrB_Matrix_nrows (&nrows, C_expected) ; + GrB_Matrix_ncols (&ncols, C_expected) ; GrB_Matrix T; @@ -504,1602 +436,489 @@ bool test_AxB_dot3_full_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; else if (type == GrB_FP32 ) - { + { tol = 1e-6; op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; op_abs = GrB_ABS_FP32 ; } else if (type == GrB_FP64 ) - { + { tol = 1e12; op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; op_abs = GrB_ABS_FP64 ; } else if (type == GxB_FC32 ) - { + { tol = 2e-6; op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; op_abs = GxB_ABS_FC32 ; } else if (type == GxB_FC64 ) - { + { tol = 2e-12; op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; op_abs = GxB_ABS_FC64 ; } - // Diff = C - C_actual - GrB_Matrix Diff ; - GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; - GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_actual, NULL)) ; - GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, - C, Diff, NULL)) ; - GRB_TRY (GxB_Matrix_fprint (Diff, "Diff actual", GxB_COMPLETE, stdout)); - GRB_TRY (GrB_Matrix_free (&Diff)) ; if (tol == 0) { // check for perfect equality - GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_actual, + GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, NULL)) ; GrB_Index nvals3 = 1 ; - GRB_TRY (GxB_Matrix_fprint (T, "T actual", GxB_SHORT_VERBOSE, stdout)); GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; - if (nvals1 != nvals3) { printf ("!!\n") ; abort ( ) ; } +// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } bool is_same = false ; GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, T, NULL)) ; - if (!is_same) { printf ("!!\n") ; abort ( ) ; } + if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } GRB_TRY (GrB_Matrix_free (&T)) ; } else { // TODO: check with roundoff - { printf ("!!\n") ; abort ( ) ; } + // Diff = C - C_expected + GrB_Matrix Diff ; + GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; + GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; + GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, + C, Diff, NULL)) ; + GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); + GrB_Index nvals3 = 1 ; + GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; + if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, 
test fail!!\n") ; ADD_FAILURE( ) ; } + double is_same = false ; + GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, + Diff, NULL)) ; + printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); + EXPECT_LT( is_same/nvals3, tol); + GRB_TRY (GrB_Matrix_free (&Diff)) ; + } // re-enable the GPU GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_ALWAYS)) ; - } - } + - rmm_wrap_free(bucket); + rmm_wrap_free(nanobuckets); + rmm_wrap_free(blockbucket); rmm_wrap_free(bucketp); + rmm_wrap_free(bucket); + rmm_wrap_free(offset); + GRB_TRY(GrB_Matrix_free(&C_expected)); + CHECK_CUDA(cudaStreamDestroy(strm)); - G.del(); - + std::cout << "phase 3 test complete ======================" << std::endl; return result; } -template -bool test_reduce_factory(unsigned int N, GrB_Monoid monoid ) { +template < + typename T_C, typename T_M, typename T_A,typename T_B, + typename T_X, typename T_Y, typename T_Z> +bool test_AxB_dot3_dense_factory(mxm_problem_spec &problem_spec) { - //std::cout<<" alloc'ing data and output"< indptr(N+1); - std::vector index(N); - std::vector d_data(N); + std::cout << "phase dense test ======================" << std::endl; - indptr[N] = N; - fillvector_linear((int)N, indptr.data(), (int64_t)0); - fillvector_constant((int)N, index.data(), (int64_t)1); - fillvector_linear ( N, d_data.data()); + GpuTimer kernTimer; - GrB_Type t = cuda::jit::to_grb_type(); + cudaStream_t strm; + CHECK_CUDA(cudaStreamCreate(&strm)); - GrB_Matrix A; - make_grb_matrix(A, N, N, indptr, index, d_data, GxB_SPARSE, GxB_BY_ROW); + bool result = false; - GRB_TRY (GrB_Matrix_wait (A, GrB_MATERIALIZE)) ; - GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_COMPLETE, stdout)); + int64_t N = problem_spec.getN(); - T actual; - GB_cuda_reduce( A, &actual, monoid ); + auto mymxm = problem_spec.get_mxm_factory(); + dense_phase1launchFactory p1lF(mymxm); - GrB_Vector v; - GrB_Vector_new(&v, t, N); + GrB_Matrix C = problem_spec.getC(); + GrB_Matrix M = problem_spec.getM(); + GrB_Matrix A = problem_spec.getA(); + GrB_Matrix B = problem_spec.getB(); - // Just sum in place for now (since we are assuming sum) - int sum = 0; - for(int i = 0; i < N; ++i) { - sum+= d_data[i]; - cuda::jit::vector_set_element(v, i, d_data[i]); - } - printf("Sum: %d\n", sum); + problem_spec.set_sparsity_control( A, GxB_FULL, GxB_BY_ROW); + problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); + + const int64_t mnz = GB_nnz (M) ; + const int64_t cnz = GB_nnz (C) ; + const int64_t cvlen = C->vlen ; + const int64_t cvdim = C->vdim ; + const int64_t cnvec = C->nvec ; + + bool C_iso = false ; + GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; + + std::cout << "Running phase1 kernel" << std::endl; + kernTimer.Start(); + p1lF.jitGridBlockLaunch(C, M, A, B, strm); + CHECK_CUDA(cudaStreamSynchronize(strm)); + kernTimer.Stop(); + std::cout<<"Dense internal phase1 kernel done "<(); + GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; + + // ensure the GPU is not used GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_NEVER)) ; - printf("Invoking grb reduce\n"); - T expected; - GRB_TRY(cuda::jit::vector_reduce(&expected, v, monoid)); - printf("Done.\n"); + // Use GrB_DESC_S for structural because dot3 mask will never be complemented + // The order of B and A is swapped to account for CSR vs CSC assumption + GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), + problem_spec.getA(), problem_spec.get_mask_struct() ? 
GrB_DESC_ST1 : GrB_DESC_T1)); + + GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); + std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; + + // compare + double tol = 0 ; + GrB_Index nvals1 = 0, nvals2 = 0 ; + GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; + GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; + if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } + GrB_Index nrows, ncols ; + GrB_Matrix_nrows (&nrows, C_expected) ; + GrB_Matrix_ncols (&ncols, C_expected) ; + + GrB_Matrix T; + + GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; + GrB_BinaryOp op = NULL; + GrB_UnaryOp op_abs = NULL ; + if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; + else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; + else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; + else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; + else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; + else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; + else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; + else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; + else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; + else if (type == GrB_FP32 ) + { tol = 5e-6; + op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; + op_abs = GrB_ABS_FP32 ; + } + else if (type == GrB_FP64 ) + { tol = 1e12; + op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; + op_abs = GrB_ABS_FP64 ; + } + else if (type == GxB_FC32 ) + { tol = 2e-6; + op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; + op_abs = GxB_ABS_FC32 ; + } + else if (type == GxB_FC64 ) + { tol = 2e-12; + op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; + op_abs = GxB_ABS_FC64 ; + } + + + if (tol == 0) + { + // check for perfect equality + GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, + NULL)) ; + GrB_Index nvals3 = 1 ; + GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; +// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } + bool is_same = false ; + GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, + T, NULL)) ; + if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } + GRB_TRY (GrB_Matrix_free (&T)) ; + } + else + { + // TODO: check with roundoff + // Diff = C - C_expected + GrB_Matrix Diff ; + GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; + GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; + GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, + C, Diff, NULL)) ; + GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); + GrB_Index nvals3 = 1 ; + GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; + if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } + double is_same = false ; + GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, + Diff, NULL)) ; + printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); + EXPECT_LT( is_same/nvals3, tol); + GRB_TRY (GrB_Matrix_free (&Diff)) ; + + } + + // re-enable the GPU GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_ALWAYS)) ; - if(expected != actual) { - std::cout << "results do not match: reduced=" << expected << ", actual=" << actual << std::endl; - exit(1); - } else { - std::cout << "Results matched!" 
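The factories above (and the sparse-dense one that follows) all verify the GPU result the same way: integer and boolean types demand exact equality through GrB_EQ_* under eWiseMult plus a GrB_LAND reduce, while floating-point types build Diff = C - C_expected, take absolute values, and require the accumulated relative L1 error to stay under tol (note the FP64 tolerance is set to 1e12, which effectively waives that check). A condensed sketch of the floating-point branch for GrB_FP64 matrices, with error checking elided:

    #include "GraphBLAS.h"

    // Sketch: rel_l1 = sum(|C - C_expected|) / nvals, compared against tol
    // by the caller (the tests use EXPECT_LT for this).
    double rel_l1_err (GrB_Matrix C, GrB_Matrix C_expected,
                       GrB_Index nrows, GrB_Index ncols)
    {
        GrB_Matrix Diff ;
        GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols) ;
        // Diff = -C_expected
        GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL) ;
        // Diff = C + Diff  (entrywise union keeps unmatched entries visible)
        GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64,
            C, Diff, NULL) ;
        // Diff = |Diff|
        GrB_Matrix_apply (Diff, NULL, NULL, GrB_ABS_FP64, Diff, NULL) ;
        GrB_Index nvals ;
        GrB_Matrix_nvals (&nvals, Diff) ;
        double err = 0 ;
        GrB_Matrix_reduce_FP64 (&err, NULL, GrB_PLUS_MONOID_FP64, Diff, NULL) ;
        GrB_Matrix_free (&Diff) ;
        return (err / (double) nvals) ;
    }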
<< std::endl; + + + GRB_TRY(GrB_Matrix_free(&C_expected)); + CHECK_CUDA(cudaStreamDestroy(strm)); + + std::cout << "phase 3 dense test complete ======================" << std::endl; + return result; +} + +template < + typename T_C, typename T_M, typename T_A,typename T_B, + typename T_X, typename T_Y, typename T_Z> +bool test_AxB_dot3_sparse_dense_factory(mxm_problem_spec &problem_spec) { + + std::cout << "sparse dense test ======================" << std::endl; + + GpuTimer kernTimer; + + cudaStream_t strm; + CHECK_CUDA(cudaStreamCreate(&strm)); + + bool result = false; + + int64_t N = problem_spec.getN(); + + GrB_Matrix C = problem_spec.getC(); + GrB_Matrix M = problem_spec.getM(); + GrB_Matrix A = problem_spec.getA(); + GrB_Matrix B = problem_spec.getB(); + + problem_spec.set_sparsity_control( A, GxB_SPARSE, GxB_BY_ROW); + + // TODO: Need to make sure the format itself is actually dense. + problem_spec.set_sparsity_control( B, GxB_FULL, GxB_BY_ROW); + + auto mymxm = problem_spec.get_mxm_factory(); + dense_phase1launchFactory p1lF(mymxm); + + const int64_t mnz = GB_nnz (M) ; + const int64_t cnz = GB_nnz (C) ; + const int64_t cvlen = C->vlen ; + const int64_t cvdim = C->vdim ; + const int64_t cnvec = C->nvec ; + + bool C_iso = false ; + GrB_Type ctype = problem_spec.getBinaryOp()->ztype ; + + std::cout << "Running dense_phase1 kernel" << std::endl; + kernTimer.Start(); + p1lF.jitGridBlockLaunch(C, M, A, B, strm); + CHECK_CUDA(cudaStreamSynchronize(strm)); + kernTimer.Stop(); + std::cout<<"Dense internal phase1 kernel done "<(); + GRB_TRY (GrB_Matrix_new (&C_expected, type, N, N)) ; + + // ensure the GPU is not used + GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_NEVER)) ; + + // Use GrB_DESC_S for structural because dot3 mask will never be complemented + // The order of B and A is swapped to account for CSR vs CSC assumption + GRB_TRY (GrB_mxm(C_expected, problem_spec.getM(), NULL, problem_spec.get_semiring(), problem_spec.getB(), + problem_spec.getA(), problem_spec.get_mask_struct() ? GrB_DESC_ST1 : GrB_DESC_T1)); + + GRB_TRY(GrB_Matrix_wait(C_expected, GrB_MATERIALIZE)); + std::cout << "nnz: " << GB_nnz (C_expected) << std::endl ; + + // compare + double tol = 0 ; + GrB_Index nvals1 = 0, nvals2 = 0 ; + GRB_TRY (GrB_Matrix_nvals (&nvals1, C)) ; + GRB_TRY (GrB_Matrix_nvals (&nvals2, C_expected)) ; + if (nvals1 != nvals2) { printf ("Wrong number of nonzeroes found, test fail!!! nvals1=%lu, nvals2=%lu\n", nvals1, nvals2) ; ADD_FAILURE( ) ; } + GrB_Index nrows, ncols ; + GrB_Matrix_nrows (&nrows, C_expected) ; + GrB_Matrix_ncols (&ncols, C_expected) ; + + GrB_Matrix T; + + GRB_TRY (GrB_Matrix_new (&T, GrB_BOOL, nrows, ncols)) ; + GrB_BinaryOp op = NULL; + GrB_UnaryOp op_abs = NULL ; + if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; + else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; + else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; + else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; + else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; + else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; + else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; + else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; + else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; + else if (type == GrB_FP32 ) + { tol = 5e-6; + op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; + op_abs = GrB_ABS_FP32 ; + } + else if (type == GrB_FP64 ) + { tol = 1e12; + op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; + op_abs = GrB_ABS_FP64 ; + } + else if (type == GxB_FC32 ) + { tol = 2e-6; + op = (tol == 0)? 
GxB_EQ_FC32 : GxB_MINUS_FC32 ; + op_abs = GxB_ABS_FC32 ; + } + else if (type == GxB_FC64 ) + { tol = 2e-12; + op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; + op_abs = GxB_ABS_FC64 ; } - return expected == actual; + + + if (tol == 0) + { + // check for perfect equality + GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (T, NULL, NULL, op, C, C_expected, + NULL)) ; + GrB_Index nvals3 = 1 ; + GRB_TRY (GrB_Matrix_nvals (&nvals3, T)) ; +// if (nvals1 != nvals3) { printf (" difference matrix wrong size, test fail!! nvals1=%ld nvals3=%ld\n", nvals1, nvals3) ; ADD_FAILURE( ) ; } + bool is_same = false ; + GRB_TRY (GrB_Matrix_reduce_BOOL (&is_same, NULL, GrB_LAND_MONOID_BOOL, + T, NULL)) ; + if (!is_same) { printf (" results don't match, test fail!!\n") ; ADD_FAILURE ( ) ; } + GRB_TRY (GrB_Matrix_free (&T)) ; + } + else + { + // TODO: check with roundoff + // Diff = C - C_expected + GrB_Matrix Diff ; + GRB_TRY (GrB_Matrix_new (&Diff, GrB_FP64, nrows, ncols)) ; + GRB_TRY (GrB_Matrix_apply (Diff, NULL, NULL, GrB_AINV_FP64, C_expected, NULL)) ; + GRB_TRY (GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_PLUS_FP64, + C, Diff, NULL)) ; + GRB_TRY( GrB_Matrix_apply( Diff, NULL, NULL, op_abs, Diff, NULL) ); + GrB_Index nvals3 = 1 ; + GRB_TRY (GrB_Matrix_nvals (&nvals3, Diff)) ; + if (nvals1 != nvals3) { printf ("fp difference matrix wrong size, test fail!!\n") ; ADD_FAILURE( ) ; } + double is_same = false ; + GRB_TRY (GrB_Matrix_reduce_FP64 (&is_same, NULL, GrB_PLUS_MONOID_FP64, + Diff, NULL)) ; + printf("difference = %12.6g, rel_l1_err=%12.6g\n", is_same, is_same/nvals3 ); + EXPECT_LT( is_same/nvals3, tol); + GRB_TRY (GrB_Matrix_free (&Diff)) ; + + } + + // re-enable the GPU + GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_ALWAYS)) ; + + + GRB_TRY(GrB_Matrix_free(&C_expected)); + CHECK_CUDA(cudaStreamDestroy(strm)); + + std::cout << "phase 3 dense test complete ======================" << std::endl; + return result; } -//bool test_triangle_counting() { -// -// // Hardcoding int64_t for now -// TestData data = *make_karate_tricount(); -// -// GrB_Monoid monoid = GrB_PLUS_MONOID_INT64; -// GrB_BinaryOp binop = GrB_TIMES_INT64; -// std::cout << "Creating problem gen" << std::endl; -// N = data.A_indptr.size()-1; -// -// GrB_Matrix A; -// GrB_Matrix B; -// GrB_Matrix C; -// GrB_Matrix M; -// -// make_grb_matrix(A, data.A_indptr, data.A_indices, data.A_data, GxB_SPARSE); -// make_grb_matrix(B, data.B_indptr, data.B_indices, data.B_data, GxB_FULL, GxB_BY_ROW); -// make_grb_matrix(C, data.C_indptr, data.C_indices, data.C_data); -// make_grb_matrix(M, data.M_indptr, data.M_indices, data.M_data); -// -// GrB_Semiring mysemiring; -// auto grb_info = GrB_Semiring_new(&mysemiring, monoid, binop); -// GRB_TRY (grb_info) ; -// -// mysemiringfactory.semiring_factory ( mysemiring, false, -// C->type, M->type, -// A->type, B->type, -// true, // matrix types -// false, -// GB_sparsity(C), -// GB_sparsity(M), -// GB_sparsity(A), -// GB_sparsity(B) -// ) ; -// -// bool result = false; -// -// /** -// * Run Phase 1: Compute nanobuckets and blockbuckets -// */ -// const int64_t mnz = GB_nnz (M) ; -// -// int chunk_size = 128; -// -// // Use GrB_DESC_S for structural because dot3 mask will never be complemented -// GRB_TRY (GrB_mxm(C_actual, M, NULL, mysemiring, A, B, GrB_DESC_ST1)); -// -// GRB_TRY (GxB_Matrix_fprint (M, "M actual", GxB_SHORT_VERBOSE, stdout)); -// GRB_TRY (GxB_Matrix_fprint (A, "A actual", GxB_SHORT_VERBOSE, stdout)); -// GRB_TRY (GxB_Matrix_fprint (B, "B actual", GxB_SHORT_VERBOSE, stdout)); -// 
GRB_TRY (GxB_Matrix_fprint (C, "C GPU", GxB_SHORT_VERBOSE, stdout)); -// GRB_TRY (GxB_Matrix_fprint (C_actual, "C_actual", GxB_SHORT_VERBOSE, stdout)); -// -// GRB_TRY(GrB_reduce_) -// -// return result; -// -//} +template +bool test_reduce_factory(mxm_problem_spec &problem_spec) { + std::cout << "reduce test ======================" << std::endl; + + // TODO: This test doesn't really fit the `mxm` category + GrB_Monoid monoid = problem_spec.getMonoid(); + int64_t N = problem_spec.getN(); + + GrB_Matrix A; + + // TODO: Using C here so that the reduced type matches + GrB_Matrix_dup(&A, problem_spec.getC()); + GrB_Type type = cuda::jit::to_grb_type(); + + A->i[0] = GB_FLIP(A->i[0]); // FIXME + A->i[1] = GB_FLIP(A->i[1]); // FIXME + A->nzombies = 2; // FIXME: use an opaque method to insert zombies into A + + //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; + + GB_cuda_reduce_factory myreducefactory; + myreducefactory.reduce_factory(monoid, A); + + T_C actual; + GB_cuda_reduce(myreducefactory, A, &actual, monoid ); + + GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_NEVER)) ; + + T_C expected; + GRB_TRY(cuda::jit::matrix_reduce(&expected, A, monoid)); + + GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_ALWAYS)) ; + + double tol = 0; + GrB_BinaryOp op = NULL; + GrB_UnaryOp op_abs = NULL ; + + if (type == GrB_BOOL ) op = GrB_EQ_BOOL ; + else if (type == GrB_INT8 ) op = GrB_EQ_INT8 ; + else if (type == GrB_INT16 ) op = GrB_EQ_INT16 ; + else if (type == GrB_INT32 ) op = GrB_EQ_INT32 ; + else if (type == GrB_INT64 ) op = GrB_EQ_INT64 ; + else if (type == GrB_UINT8 ) op = GrB_EQ_UINT8 ; + else if (type == GrB_UINT16) op = GrB_EQ_UINT16 ; + else if (type == GrB_UINT32) op = GrB_EQ_UINT32 ; + else if (type == GrB_UINT64) op = GrB_EQ_UINT64 ; + else if (type == GrB_FP32 ) + { tol = 1e-6; + op = (tol == 0)? GrB_EQ_FP32 : GrB_MINUS_FP32 ; + op_abs = GrB_ABS_FP32 ; + } + else if (type == GrB_FP64 ) + { tol = 1e12; + op = (tol == 0)? GrB_EQ_FP64 : GrB_MINUS_FP64 ; + op_abs = GrB_ABS_FP64 ; + } + else if (type == GxB_FC32 ) + { tol = 2e-6; + op = (tol == 0)? GxB_EQ_FC32 : GxB_MINUS_FC32 ; + op_abs = GxB_ABS_FC32 ; + } + else if (type == GxB_FC64 ) + { tol = 2e-12; + op = (tol == 0)? GxB_EQ_FC64 : GxB_MINUS_FC64 ; + op_abs = GxB_ABS_FC64 ; + } + + if(tol == 0) { + EXPECT_EQ( actual , expected); + //std::cout << "results do not match: reduced=" << expected << ", actual=" << actual << std::endl; + //exit(1); + } else if ( (tol > 0) && ( ( type ==GrB_FP32) || ( type ==GxB_FC32) + || ( type ==GrB_FP64) || ( type ==GxB_FC64) ) ){ + EXPECT_LT( abs((double)actual - (double)expected)/(abs((double)expected)+1.e-12), tol) ; + } + + std::cout<< expected<< " " << actual<< "reduce test complete ======================" << std::endl; + GRB_TRY(GrB_Matrix_free(&A)); + + return expected == actual; +} -//template -//bool test_AxB_dot3_dndn_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) { -//// Assumes all matrices are square so far, so only N dimension given. -//// Sparsity is dense here so Anz = Bnz = N*N. -//// Generates three randomized matrices, builds buckets and calls a kernel. 
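The new reduce test above plants zombies by hand: flipping a row index marks the entry as deleted-in-place, nzombies tells the library how many to expect, and the CUDA kernel's `if (is_sparse && index[i] < 0) continue` skips them. A host-side sketch of the same skip, with plain addition standing in for the monoid:

    #include <cstdint>

    // Serial reference for reduceNonZombiesWarp: any negative (flipped)
    // row index is a zombie and contributes nothing to the reduction.
    static double reduce_skip_zombies (const int64_t *Ai, const double *Ax,
                                       int64_t anz)
    {
        double sum = 0 ;
        for (int64_t p = 0 ; p < anz ; p++)
        {
            if (Ai [p] < 0) continue ;      // zombie: skip it
            sum += Ax [p] ;
        }
        return (sum) ;
    }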
-// -// -//launchFactory lF(SEMI_RING, "dndn"); -// -//int testBucket = TB; -// -////unsigned seed = 13372801; -////std::mt19937 r; //random number generator Mersenne Twister -////r.seed(seed); -//int gpuID; -//cudaGetDevice( &gpuID); -// -//std::cout<< "found device "< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// std::cout << "Min Plus Times (min,+) semiring"<::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// MONOID_IDENTITY = std::numeric_limits::min(); -// std::cout << "Max Plus Times (max,+) semiring"< = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -//int64_t Annz = N*N; -//int64_t Bnnz = N*N; -//int64_t Cnz = N; -//float Cnzpercent = (float) Cnz/(N*N); -// -//G.init(N, Annz, Bnnz, Cnzpercent); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -// -//// Set clear zombie count -//C->zombie_count = 0; -// -////std::cout<<"got all matrices"< 0) std::cout<< "bucket "<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// //std::cout<<"Cx[i] = "<i[i]; -// } -// G.loadCj(); -// -// for (int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<i[pC] ; // row index of C(i,j) -// -// // get C(i,j) -// int64_t k = (C->i [pC] >> 4) ; // col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "<p [i] ; -// int64_t pA_end = A->p [i+1] ; -// // indices are in Ai [pA_start ... pA_end-1] -// // values are in Ax [pA_start ... pA_end-1] -// -// // yvp, yvi, yvals: B(:,j) -// // yvp is Bp [j] and Bp [j+1] -// int64_t pB_start = B->p [j] ; -// int64_t pB_end = B->p [j+1] ; -// // indices are in Bi [pB_start ... pB_end-1] -// // values are in Bx [pB_start ... 
pB_end-1] -// k = pA_start; -// int64_t l = pB_start; -// T_Z cij = MONOID_IDENTITY; -// while( k < pA_end && l < pB_end) { -// //std::cout<<" A*B="<< (*MUL_ptr) ( (T_Z)Ax[k] , (T_Z) Bx[l]) <)( cij, (*MUL_ptr)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ; -// k++; -// l++; -// //std::cout<<"Ak = "<< Ax[k]<< " Bl = "<< Bx[l]<< "sum ="< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::min(); -// ADD_ptr = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -//int64_t Cnz = N; -//float Cnzpercent = (float) Cnz/(N*N); -// -//G.init(N, Anz, Bnz, Cnzpercent); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -//int64_t *Ci = C->i; -//int64_t *Mi = M->i; -//int64_t *Ai = A->i; -//int64_t *Bi = B->i; -//int64_t *Ap = A->p; -//int64_t *Bp = B->p; -// -////std::cout<<"got all matrices"< 0) std::cout<< "bucket "<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// X_valid[i] = Cx[i]; -// Cx[i] = 0; -// i_valid[i] = Ci[i]; -// } -// G.loadCj(); -// for (int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<> 4) ; // col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "< Bi[l]) ++l; -// else { -// if (cij_exists) { -// cij = (*ADD_ptr)( cij, (*MUL_ptr)( Ax[k] , Bx[l] ) ); -// } -// else{ -// cij_exists = true; -// cij = (*MUL_ptr)( Ax[k], Bx[l]); -// } -// k++; -// l++; -// } -// } -// //std::cout<< " dot = "<< sum << std::endl; -// -// // output for this dot product is -// -// if (cij_exists) { -// Ci [pC] = i; -// Cx[pC] = (T_C)cij; -// } -// else { -// Ci [pC] = -1;//GB_FLIP (i) -// C->zombie_count++; -// } -// } -// T_C err = 0; -// for (int j =0 ; j< N; ++j) { -// for ( int l = C->p[j]; l< C->p[j+1]; ++l) { -// //std::cout< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::min(); -// ADD_ptr = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -// -//int64_t Cnz = N; -//float Cnzpercent = (float)( Cnz)/(N*N); -// -//G.init(N, Anz, Bnz, Cnzpercent ); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -//int64_t *Ci = C->i; -//int64_t *Mi = M->i; -//int64_t *Ai = A->i; -//int64_t *Bi = B->i; -//int64_t *Ap = A->p; -//int64_t *Bp = B->p; -// -// -////std::cout<<"got all matrices"<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// X_valid[i] = Cx[i]; -// Cx[i] = 0; -// i_valid[i] = C->i[i]; -// } -// G.loadCj(); -// -// -// for 
(int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<> 4) ; // col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "< pB) ; -// pB = pleft ; -// } -// else // ia == ib == k -// { -// // A(k,i) and B(k,j) are the next entries to merge -// #if defined ( GB_PHASE_1_OF_2 ) -// cij_exists = true ; -// break ; -// #else -// GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ -// GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ -// if (cij_exists) -// { -// cij = (*ADD_ptr)( cij, (*MUL_ptr)( (T_Z)aki , (T_Z)bkj ) ); -// /* cij += aki * bkj */ -// } -// else -// { -// /* cij = A(k,i) * B(k,j), and add to the pattern */ -// cij_exists = true ; -// cij= (*MUL_ptr)( (T_Z)aki, (T_Z)bkj) ; -// /* cij = aki * bkj */ -// } -// //GB_DOT_TERMINAL (cij) ; // break if cij == terminal -// pA++ ; -// pB++ ; -// #endif -// } -// } -// } -// else { -// //---------------------------------------------------------------------- -// // B(:,j) is very sparse compared to A(:,i) -// //---------------------------------------------------------------------- -// -// while (pA < pA_end && pB < pB_end) -// { -// int64_t ia = Ai [pA] ; -// int64_t ib = Bi [pB] ; -// if (ia < ib) -// { -// // A(ia,i) appears before B(ib,j) -// // discard all entries A(ia:ib-1,i) -// int64_t pleft = pA + 1 ; -// int64_t pright = pA_end - 1 ; -// GB_BINARY_TRIM_SEARCH (ib, Ai, pleft, pright) ; -// //ASSERT (pleft > pA) ; -// pA = pleft ; -// } -// else if (ib < ia) -// { -// // B(ib,j) appears before A(ia,i) -// pB++ ; -// } -// else // ia == ib == k -// { -// // A(k,i) and B(k,j) are the next entries to merge -// #if defined ( GB_PHASE_1_OF_2 ) -// cij_exists = true ; -// break ; -// #else -// GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ -// GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ -// if (cij_exists) -// { -// cij = (*ADD_ptr)( cij, (*MUL_ptr)( (T_Z)aki , (T_Z)bkj ) ); -// /* cij += aki * bkj */ \ -// } -// else -// { -// /* cij = A(k,i) * B(k,j), and add to the pattern */ -// cij_exists = true ; -// cij= (*MUL_ptr)( (T_Z)aki, (T_Z)bkj) ; -// } -// //GB_DOT_TERMINAL (cij) ; // break if cij == terminal -// pA++ ; -// pB++ ; -// #endif -// } -// } -// -// } -// if ( cij_exists){ -// Ci[pair] = i; -// Cx[pair] = (T_C)cij; -// } -// else { -// zc++; -// //printf(" %lld, %lld is zombie %d!\n",i,j,zc); -// Ci[pair] = GB_FLIP( i ); -// } -// -// } -// C->zombie_count = zc; -// T_C err = 0; -// for (int j =0 ; j< N; ++j) { -// for ( int l = C->p[j]; l< C->p[j+1]; ++l) { -// int64_t i = Ci[l]; -// //std::cout< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// // MONOID_IDENTITY = std::numeric_limits::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// // MONOID_IDENTITY = std::numeric_limits::min(); -// ADD_ptr = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -// -//int64_t Cnz = N; -//float Cnzpercent = (float)( Cnz)/(N*N); -// -////spdn case means B should be dense -> Bnz = N*N; -//G.init(N, Anz, N*N, Cnzpercent ); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -//int64_t *Ci = C->i; 
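The commented-out reference loops above all compute the same quantity on the host: a dot product cij = A(:,i)'*B(:,j) between two compressed sparse vectors, producing a zombie entry when the index patterns do not intersect. Stripped of the bucket machinery, the two-pointer merge they implement looks like the following sketch, specialized to the PLUS_TIMES semiring with illustrative container types (not code from this change):

#include <cstdint>
#include <vector>

// merge two sorted index lists, accumulating products where indices match;
// returns false when the patterns do not intersect (C(i,j) becomes a zombie)
template <typename T>
bool sparse_dot (const std::vector<int64_t> &Ai, const std::vector<T> &Ax,
                 const std::vector<int64_t> &Bi, const std::vector<T> &Bx,
                 T &cij)
{
    bool cij_exists = false ;
    size_t k = 0, l = 0 ;
    while (k < Ai.size () && l < Bi.size ())
    {
        if      (Ai [k] < Bi [l]) k++ ;      // A entry has no partner in B
        else if (Ai [k] > Bi [l]) l++ ;      // B entry has no partner in A
        else                                 // indices match: accumulate
        {
            cij = cij_exists ? cij + Ax [k] * Bx [l] : Ax [k] * Bx [l] ;
            cij_exists = true ;
            k++ ; l++ ;
        }
    }
    return cij_exists ;
}

The vssp variant above replaces the linear ++k / ++l advances with GB_BINARY_TRIM_SEARCH when one pattern is much sparser than the other, and the spdn variant indexes the dense operand directly, but the accumulation is the same.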
-//int64_t *Mi = M->i; -//int64_t *Ai = A->i; -//int64_t *Bi = B->i; -//int64_t *Ap = A->p; -//int64_t *Bp = B->p; -// -// -////std::cout<<"got all matrices"<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// X_valid[i] = Cx[i]; -// Cx[i] = 0; -// i_valid[i] = Ci[i]; -// } -// G.loadCj(); -// for (int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<> 4) ; // col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// //int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "<vlen) // A is dense -// { -// int64_t k = Bi [pB] ; // first row index of B(:,j) -// // cij = A(k,i) * B(k,j) -// GB_GETA (aki, Ax, pA+k) ; // aki = A(k,i) -// GB_GETB (bkj, Bx, pB ) ; // bkj = B(k,j) -// cij = (*MUL_ptr)( aki, bkj) ; // cij = aki * bkj -// -// for (int64_t p = pB+1 ; p < pB_end ; p++) -// { -// //GB_DOT_TERMINAL (cij) ; // break if cij == terminal -// int64_t k = Bi [p] ; // next row index of B(:,j) -// // cij += A(k,i) * B(k,j) -// GB_GETA (aki, Ax, pA+k) ; // aki = A(k,i) -// GB_GETB (bkj, Bx, p ) ; // bkj = B(k,j) -// cij = (*ADD_ptr)( cij, (*MUL_ptr)( (T_Z)aki, (T_Z)bkj) ); -// } -// -// } -// if( nnzB == B->vlen) // B is dense -// { -// int64_t k = Ai [pA] ; // first row index of A(:,i) -// // cij = A(k,i) * B(k,j) -// GB_GETA (aki, Ax, pA ) ; // aki = A(k,i) -// GB_GETB (bkj, Bx, pB+k) ; // bkj = B(k,j) -// cij = (*MUL_ptr)( aki, bkj) ; // cij = aki * bkj -// -// for (int64_t p = pA+1 ; p < pA_end ; p++) -// { -// //GB_DOT_TERMINAL (cij) ; // break if cij == terminal -// int64_t k = Ai [p] ; // next row index of A(:,i) -// // cij += A(k,i) * B(k,j) -// GB_GETA (aki, Ax, p ) ; // aki = A(k,i) -// GB_GETB (bkj, Bx, pB+k) ; // bkj = B(k,j) -// cij = (*ADD_ptr)( cij, (*MUL_ptr)( (T_Z)aki, (T_Z)bkj) ); -// } -// } -// -// Ci[pair] = i; -// Cx[pair] = cij; -// -// } -// T_C err = 0; -// for (int j =0 ; j< N; ++j) { -// for ( int l = C->p[j]; l< C->p[j+1]; ++l) { -// int64_t i = Ci[l]; -// //std::cout< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// std::cout << "Min Plus Times (min,+) semiring"<::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::min(); -// std::cout << "Max Plus Times (max,+) semiring"< = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -//int64_t Annz = Anz; -//int64_t Bnnz = Bnz; -//int64_t Cnz = N; -//float Cnzpercent = (float) Cnz/(N*N); -// -//G.init(N, Annz, Bnnz, Cnzpercent); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -//int64_t *Ci = C->i; -//int64_t *Mi = M->i; -//int64_t *Ai = A->i; -//int64_t *Bi = B->i; -//int64_t *Ap = A->p; -//int64_t *Bp = B->p; -// -//// Set clear zombie count -//C->zombie_count = 0; -// -////std::cout<<"got all matrices"< 0) std::cout<< "bucket "<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// //std::cout<<"Cx[i] = "<i[i]; -// // clear values for next test -// Cx[i] = 0; -// } -// G.loadCj(); -// -// for (int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<> 4) ; 
// col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "< Bi[l] ) l += 1; -// else { -// if (cij_exists) { -// //std::cout<<" A*B="<< (*MUL_ptr) ( (T_Z)Ax[k] , (T_Z) Bx[l]) <)( cij, (*MUL_ptr)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ; -// } -// else { -// cij_exists = true; -// cij = (*MUL_ptr)( (T_Z)Ax[k], (T_Z)Bx[l] ) ; -// } -// -// k++; -// l++; -// } -// //std::cout<<"Ak = "<< Ax[k]<< " Bl = "<< Bx[l]<< "sum ="< = myOP_plus; -// MUL_ptr = myOP_times; -// -//} -//else if(SEMI_RING == "MIN_PLUS") { -// std::cout << "Min Plus Times (min,+) semiring"<::max(); -// ADD_ptr = myOP_min; -// MUL_ptr = myOP_plus; -// -//} -//else if(SEMI_RING == "MAX_PLUS") { -// //MONOID_IDENTITY = std::numeric_limits::min(); -// std::cout << "Max Plus Times (max,+) semiring"< = myOP_max; -// MUL_ptr = myOP_plus; -//} -// -////Generate test data and setup for using a jitify kernel with 'bucket' interface -//SpGEMM_problem_generator G; -//int64_t Cnz = N; -//float Cnzpercent = (float) Cnz/(N*N); -// -//G.init(N, Anz, Bnz, Cnzpercent); -// -//G.fill_buckets( testBucket); // all elements go to testbucket= TB -// -//matrix* C = G.getCptr(); -//matrix* M = G.getMptr(); -//matrix* A = G.getAptr(); -//matrix* B = G.getBptr(); -// -//T_C *Cx = C->x; -//T_A *Ax = A->x; -//T_B *Bx = B->x; -//int64_t *Ci = C->i; -//int64_t *Mi = M->i; -//int64_t *Ai = A->i; -//int64_t *Bi = B->i; -//int64_t *Ap = A->p; -//int64_t *Bp = B->p; -// -//// Set clear zombie count -//C->zombie_count = 0; -// -////std::cout<<"got all matrices"< 0) std::cout<< "bucket "<zombie_count; -// C->zombie_count = 0; -// for (int i =0 ; i< Cnz; ++i) { -// //std::cout<<"Cx[i] = "<i[i]; -// // clear values for next test -// Cx[i] = 0; -// } -// G.loadCj(); -// -// for (int64_t pair = b_start ; pair < b_end ; pair++) { -// -// // get the kth entry in bucket b -// //std::cout<< " pair ="<> 4) ; // col index of C(i,j) -// //ASSERT ((C->i [pC] & 4) == b) ; -// int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch -// //std::cout<<" found dot "< Bi[l] ) l += 1; -// else { -// if (cij_exists) { -// //std::cout<<" A*B="<< (*MUL_ptr) ( (T_Z)Ax[k] , (T_Z) Bx[l]) <)( cij, (*MUL_ptr)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ; -// } -// else { -// cij_exists = true; -// cij = (*MUL_ptr)( (T_Z)Ax[k], (T_Z)Bx[l] ) ; -// } -// -// k++; -// l++; -// } -// //std::cout<<"Ak = "<< Ax[k]<< " Bl = "<< Bx[l]<< "sum ="< = myOP_plus; -// sum = (T3)0; -// } -// if (SEMI_RING == "MIN_PLUS") -// { -// sum = std::numeric_limits::max(); -// myOpPTR = myOP_min; -// } -// -// for (int i =0; i< nblock; ++i) sum = (*myOpPTR)(sum ,output[i]); -// -// bool result = false; -// T3 expect; -// if (SEMI_RING == "PLUS_TIMES") { -// expect = (T3)(N*(N-1)/2); -// T3 temp = (sum -expect) ; -// if (temp < 0) temp = -temp ; -// //result = (temp < (T3)1) ; //adjust formula for leading 0 -// EXPECT_LE( temp, (T3)1 ); -// } -// else if (SEMI_RING == "MIN_PLUS") { -// expect = (T3) 1; -// //result = (sum == expect) ; //min is 1 from the (0,1) pair -// EXPECT_EQ( sum, expect); -// } -// else expect = (T3)0; -// std::cout <<"test_dotfactoryUM with "< -//bool test_spdotfactoryUM( unsigned int N, unsigned int xn, unsigned int yn, std::string SEMI_RING) { -// -//#define INTMIN( A, B) ( (A) < (B) ) ? (A) : (B) -// -// // N here is the index space that the sparse vectors are drawn from. 
-// // Indices in xi and yi are in the range (0,N-1) -// // We will generate a number of random values in this range for test data -// std::cout<< " xn,yn= "< (xn, x, T1(1)); -// fillvector_constant (yn, y, T2(1)); -// -// if( xn == yn){ // test case : all values intersect, generate 1 random number for both -// intersection_size = xn; -// std::cout << " all-intersect case..."< yi[yp]) yp++; -// else { -// intersection_size++; -// xp++; -// yp++; -// } -// if ( ( xp == xn ) || ( yp == yn) ) break; -// } -// } -// if( xn < 128 ) { -// -// std::cout<< " xi = ["; -// for (unsigned int i = 0 ; i < xn; ++i) { -// std::cout<< xi[i] << ","; -// } -// std::cout<< " ]" < = myOP_plus; -// sum = (T3)0; -// } -// if (SEMI_RING == "MIN_PLUS") -// { -// sum = std::numeric_limits::max(); -// myOpPTR = myOP_min; -// } -// -// for (int i =0; i< nblock; ++i) sum = (*myOpPTR)(sum ,output[i]); -// -// bool result = false; -// T3 expect; -// if (SEMI_RING == "PLUS_TIMES") { -// T3 temp; -// expect = intersection_size; -// temp = (sum - expect); -// if (temp < 0) temp = -temp ; -// result = (temp < (T3)1) ; //adjust formula for leading 0 -// } -// else if (SEMI_RING == "MIN_PLUS") { -// expect = 2; -// result = (sum== expect) ; //min is 2 from the (1,1) pair -// } -// else expect = (T3) 0; -// -// std::cout <<"test_spdotfactoryUM with "<add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring -nanobuckets_size: 384 -blockbuckets_size: 12 - rmm_wrap_alloc 3072 bytes - rmm_wrap_alloc 256 bytes -A TYpe: 0x7f2028b56f40 -B TYpe: 0x7f2028b56f40 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed -GB_jit_AxB_phase1 -#include "/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h" -#include "GB_jit_AxB_phase1.cuh" - jit_cache get program GB_jit_AxB_phase1 -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_phase1 - got kernel instance AxB_phase1_bool -about to close - read cache file 
/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_phase1_bool ---------------------------------------- ---- Linker for void AxB_phase1(long long*, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*) --- ---------------------------------------- -info : 0 bytes gmem -info : Function properties for '_Z10AxB_phase1IbEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_': -info : used 199 registers, 4576 stack, 3104 bytes smem, 400 bytes cmem[0], 0 bytes lmem - - ---------------------------------------- -Launching _Z10AxB_phase1IbEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_<<<1,32,0,0>>>(long*,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*) -returned from phase1 kernel 10.4223ms -Printing Nanobuckets -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -Printing Blockbucket -0, 0, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. 
-==== phase1 done============================= -[ OK ] AxB_dot3_tests_PLUS_TIMES_1.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (11 ms) -[ RUN ] AxB_dot3_tests_PLUS_TIMES_1.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t -found device 0 -inside fill, using seed 12345 -fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 32768 bytes - rmm_wrap_alloc 32768 bytes - rmm_wrap_alloc 65536 bytes - rmm_wrap_alloc 65536 bytes - rmm_wrap_alloc 131072 bytes - rmm_wrap_alloc 262144 bytes - rmm_wrap_alloc 262144 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes -inside fill, using seed 54321 -fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. - rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 8192 bytes -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. 
- rmm_wrap_alloc 16384 bytes -1024 slots to fill -all pairs to bucket 5, no filling -done assigning buckets - calling stringify semiring: 0x7f1fea02dd00 -inside enumify: 0x7f1fea02dd00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring -nanobuckets_size: 3072 -blockbuckets_size: 96 - rmm_wrap_alloc 24576 bytes - rmm_wrap_alloc 768 bytes -A TYpe: 0x7f2028b56f40 -B TYpe: 0x7f2028b56f40 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed -GB_jit_AxB_phase1 -#include "/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h" -#include "GB_jit_AxB_phase1.cuh" - jit_cache get program GB_jit_AxB_phase1 -found memory-cached prog GB_jit_AxB_phase1 - got kernel instance AxB_phase1_bool -found memory-cached prog AxB_phase1_bool -Launching _Z10AxB_phase1IbEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_<<<8,32,0,0>>>(long*,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*) -returned from phase1 kernel 1.93946ms -Printing Nanobuckets -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -Printing Blockbucket -0, 0, 0, 0, 0, 0, 0, 0, 248, 248, 248, 248, 248, 248, 248, 248, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -==== phase1 done============================= -[ OK ] AxB_dot3_tests_PLUS_TIMES_1.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (235 ms) -[ RUN ] AxB_dot3_tests_PLUS_TIMES_1.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t -found device 0 -inside fill, using seed 12345 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 54321 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes -32 slots to fill -all pairs to bucket 5, no filling -done assigning buckets - calling stringify semiring: 0x7f1fea02db00 -inside enumify: 0x7f1fea02db00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring -nanobuckets_size: 384 -blockbuckets_size: 12 - rmm_wrap_alloc 3072 bytes - rmm_wrap_alloc 256 bytes -A TYpe: 0x7f2028b56f40 -B TYpe: 0x7f2028b56f40 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed -GB_jit_AxB_phase1 -#include "/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h" -#include "GB_jit_AxB_phase1.cuh" - jit_cache get program GB_jit_AxB_phase1 -found memory-cached prog GB_jit_AxB_phase1 - got kernel instance AxB_phase1_int32_t -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_phase1_int32_t ---------------------------------------- ---- Linker for void AxB_phase1(long long*, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*) --- ---------------------------------------- -info : 0 bytes gmem -info : Function properties for '_Z10AxB_phase1IiEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_': -info : used 199 registers, 4576 stack, 3104 bytes smem, 400 bytes cmem[0], 0 bytes lmem - - ---------------------------------------- -Launching _Z10AxB_phase1IiEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_<<<1,32,0,0>>>(long*,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*) -returned from phase1 kernel 9.86829ms -Printing Nanobuckets -0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 5, 6, 6, 6, 6, 6, 7, 8, 9, 9, 9, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15, 16, 16, 17, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -Printing Blockbucket -28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -==== phase1 done============================= -[ OK ] AxB_dot3_tests_PLUS_TIMES_1.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (11 ms) -[ RUN ] AxB_dot3_tests_PLUS_TIMES_1.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t -found device 0 -inside fill, using seed 12345 -fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes -inside fill, using seed 54321 -fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 524288 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 1048576 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 2097152 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes - rmm_wrap_alloc 4194304 bytes - rmm_wrap_alloc 8388608 bytes -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. 
- rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 16384 bytes -1024 slots to fill -all pairs to bucket 5, no filling -done assigning buckets - calling stringify semiring: 0x7f1fea03ef00 -inside enumify: 0x7f1fea03ef00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring -nanobuckets_size: 3072 -blockbuckets_size: 96 - rmm_wrap_alloc 24576 bytes - rmm_wrap_alloc 768 bytes -A TYpe: 0x7f2028b56f40 -B TYpe: 0x7f2028b56f40 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed -GB_jit_AxB_phase1 -#include "/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h" -#include "GB_jit_AxB_phase1.cuh" - jit_cache get program GB_jit_AxB_phase1 -found memory-cached prog GB_jit_AxB_phase1 - got kernel instance AxB_phase1_int32_t -found memory-cached prog AxB_phase1_int32_t -Launching _Z10AxB_phase1IiEvPxS0_P16GB_Matrix_opaqueS2_S2_S2_<<<8,32,0,0>>>(long*,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*) -returned from phase1 kernel 2.08486ms -Printing Nanobuckets -0, 3, 4, 6, 9, 12, 13, 15, 18, 20, 21, 23, 26, 30, 32, 35, 37, 38, 40, 43, 45, 45, 48, 51, 54, 55, 58, 58, 61, 61, 63, 66, 0, 1, 4, 6, 7, 8, 11, 13, 14, 16, 19, 21, 22, 22, 24, 25, 27, 30, 32, 33, 35, 39, 40, 41, 42, 45, 46, 50, 51, 55, 57, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8, 12, 15, 19, 19, 22, 23, 25, 27, 30, 32, 33, 34, 37, 39, 40, 42, 45, 48, 50, 50, 54, 57, 61, 63, 65, 67, 68, 69, 0, 2, 4, 4, 4, 5, 5, 9, 10, 13, 15, 17, 18, 20, 23, 26, 27, 29, 32, 34, 35, 36, 38, 42, 42, 43, 43, 45, 47, 49, 52, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 4, 7, 8, 9, 11, 12, 15, 16, 18, 19, 22, 25, 27, 29, 31, 34, 35, 37, 38, 40, 41, 44, 46, 48, 50, 53, 56, 57, 58, 0, 3, 5, 8, 9, 12, 15, 17, 20, 21, 24, 26, 29, 30, 31, 33, 35, 37, 38, 41, 43, 46, 48, 51, 52, 54, 56, 58, 59, 60, 63, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 9, 11, 14, 15, 17, 18, 19, 22, 24, 25, 28, 31, 32, 33, 36, 38, 41, 45, 46, 47, 48, 50, 52, 54, 55, 55, 58, 60, 62, 0, 2, 2, 3, 5, 6, 9, 11, 14, 17, 18, 20, 23, 24, 25, 28, 31, 32, 34, 35, 35, 38, 41, 44, 46, 48, 50, 53, 57, 58, 60, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 7, 8, 12, 14, 15, 16, 20, 23, 26, 29, 32, 34, 35, 38, 40, 41, 45, 47, 48, 50, 53, 57, 59, 61, 63, 66, 68, 71, 72, 0, 1, 4, 5, 8, 8, 10, 13, 16, 16, 17, 18, 19, 20, 22, 25, 26, 28, 31, 31, 33, 36, 38, 39, 39, 41, 43, 45, 46, 48, 49, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 5, 8, 10, 11, 12, 13, 15, 16, 18, 20, 24, 24, 26, 28, 29, 30, 32, 34, 35, 35, 36, 39, 42, 43, 46, 48, 50, 52, 0, 3, 5, 9, 11, 12, 14, 17, 20, 23, 25, 28, 30, 32, 32, 36, 38, 40, 43, 46, 48, 50, 53, 57, 60, 61, 62, 65, 66, 68, 70, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 8, 10, 11, 11, 12, 13, 13, 14, 17, 17, 20, 21, 23, 26, 29, 31, 33, 35, 35, 37, 40, 42, 46, 49, 53, 54, 58, 59, 60, 0, 1, 3, 4, 6, 9, 13, 16, 19, 23, 26, 27, 31, 32, 35, 37, 38, 39, 41, 43, 45, 49, 51, 52, 54, 54, 55, 55, 58, 58, 61, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 7, 9, 11, 13, 15, 18, 20, 22, 24, 25, 27, 27, 28, 28, 29, 31, 35, 38, 39, 41, 44, 47, 50, 52, 54, 58, 58, 59, 59, 0, 3, 4, 5, 7, 9, 11, 13, 14, 16, 18, 20, 23, 25, 29, 32, 36, 39, 41, 41, 42, 45, 47, 48, 49, 50, 52, 54, 54, 58, 61, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -Printing Blockbucket -132, 138, 116, 124, 144, 104, 120, 118, 116, 110, 132, 124, 104, 144, 128, 130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -==== phase1 done============================= -[ OK ] AxB_dot3_tests_PLUS_TIMES_1.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (230 ms) -[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_1 (489 ms total) - -[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_2 -[ RUN ] AxB_dot3_tests_PLUS_TIMES_2.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t -found device 0 -inside fill, using seed 12345 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 54321 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. 
-2 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-32 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
- rmm_wrap_alloc 3072 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-nthrd: 32, ntasks: 1
-Printing nanobuckets
-[... dump elided: all values 1 ...]
-Done.
-Printing blockbucket
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-Done.
-GB_jit_AxB_phase2
-#include "GB_jit_AxB_phase2.cuh"
- jit_cache get program GB_jit_AxB_phase2
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_phase2
- got kernel instance AxB_phase2
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_phase2
----------------------------------------
---- Linker for AxB_phase2(long long*, long long*, int) ---
----------------------------------------
-info : 39 bytes gmem
-info : Function properties for '_Z10AxB_phase2PxS_i':
-info : used 88 registers, 720 stack, 32 bytes smem, 372 bytes cmem[0], 0 bytes lmem
-
-
----------------------------------------
-Launching _Z10AxB_phase2PxS_i<<<1,32,0,0>>>(long*,long*,int)
-s_0: 1, s_1=1, s_10=1, s_11=1
-GB_jit_AxB_phase2end
-#include "GB_jit_AxB_phase2end.cuh"
- jit_cache get program GB_jit_AxB_phase2end
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_phase2end
- got kernel instance AxB_phase2end
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_phase2end
----------------------------------------
---- Linker for AxB_phase2end(long long*, long long const*, long long const*, long long*, long long const*, GB_Matrix_opaque*, long long) ---
----------------------------------------
-info : 0 bytes gmem
-info : Function properties for '_Z13AxB_phase2endPxPKxS1_S_S1_P16GB_Matrix_opaquex':
-info : used 107 registers, 0 stack, 0 bytes smem, 408 bytes cmem[0], 0 bytes lmem
-
-
----------------------------------------
-Launching _Z13AxB_phase2endPxPKxS1_S_S1_P16GB_Matrix_opaquex<<<1,32,0,0>>>(long*,long*,long*,long*,long*,GB_Matrix_opaque*,long)
-Printing bucketp
-0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-Done.
-Printing bucket
-0, 16, 5, 8, 9, 12, 15, 17, 20, 21, 24, 26, 29, 30, 31, 33, 35, 37, 38, 41, 43, 46, 48, 51, 52, 54, 56, 58, 59, 60, 63, 66,
-Done.
-phase2 kernel done ==================
-[       OK ] AxB_dot3_tests_PLUS_TIMES_2.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (17 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_2.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t
-found device 0
-inside fill, using seed 12345
-fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
-[... 20 rmm_wrap_alloc lines elided, block sizes 1 KB through 8 MB ...]
-inside fill, using seed 54321
-fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
-[... 17 rmm_wrap_alloc lines elided, block sizes 16 KB through 8 MB ...]
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-inside fill, using seed 4567
-fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
- rmm_wrap_alloc 4096 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
- rmm_wrap_alloc 24576 bytes
- rmm_wrap_alloc 768 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
- rmm_wrap_alloc 256 bytes
-nthrd: 32, ntasks: 8
-Printing nanobuckets
-[... dump elided: all values 1 ...]
-Done.
-Printing blockbucket
-[... 96 values, all 1 ...]
-Done.
-GB_jit_AxB_phase2
-#include "GB_jit_AxB_phase2.cuh"
- jit_cache get program GB_jit_AxB_phase2
-found memory-cached prog GB_jit_AxB_phase2
- got kernel instance AxB_phase2
-found memory-cached prog AxB_phase2
-Launching _Z10AxB_phase2PxS_i<<<1,32,0,0>>>(long*,long*,int)
-s_0: 1, s_1=1, s_10=1, s_11=1
-GB_jit_AxB_phase2end
-#include "GB_jit_AxB_phase2end.cuh"
- jit_cache get program GB_jit_AxB_phase2end
-found memory-cached prog GB_jit_AxB_phase2end
- got kernel instance AxB_phase2end
-found memory-cached prog AxB_phase2end
-Launching _Z13AxB_phase2endPxPKxS1_S_S1_P16GB_Matrix_opaquex<<<8,32,0,0>>>(long*,long*,long*,long*,long*,GB_Matrix_opaque*,long)
-Printing bucketp
-0, 2, 4, 4294967302, 4294967303, 8589934600, 12884901897, 17179869194, 17179869195, 21474836492, 21474836493, 25769803791,
-Done.
-Printing bucket
-1, 656, 816, 848, 880, [... 1024-value dump elided: remaining values are 0, 1, 4294967296, or 4294967297 ...]
-Done.
-phase2 kernel done ==================
- rmm_wrap_alloc 1048576 bytes
- rmm_wrap_alloc 4194304 bytes
-[       OK ] AxB_dot3_tests_PLUS_TIMES_2.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (226 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_2.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t
-found device 0
-inside fill, using seed 12345
-fill_random nrows=32ncols=32 need 1024 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
- rmm_wrap_alloc 512 bytes
- rmm_wrap_alloc 512 bytes
-inside fill, using seed 54321
-fill_random nrows=32ncols=32 need 1024 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=32ncols=32 need 32 values, invsparse = 32
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-32 nonzeroes left to fill..
-2 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-inside fill, using seed 4567
-fill_random nrows=32ncols=32 need 32 values, invsparse = 32
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-32 nonzeroes left to fill..
-2 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-32 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
- rmm_wrap_alloc 3072 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-nthrd: 32, ntasks: 1
-Printing nanobuckets
-[... dump elided: all values 1 ...]
-Done.
-Printing blockbucket
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-Done.
-GB_jit_AxB_phase2
-#include "GB_jit_AxB_phase2.cuh"
- jit_cache get program GB_jit_AxB_phase2
-found memory-cached prog GB_jit_AxB_phase2
- got kernel instance AxB_phase2
-found memory-cached prog AxB_phase2
-Launching _Z10AxB_phase2PxS_i<<<1,32,0,0>>>(long*,long*,int)
-s_0: 1, s_1=1, s_10=1, s_11=1
-GB_jit_AxB_phase2end
-#include "GB_jit_AxB_phase2end.cuh"
- jit_cache get program GB_jit_AxB_phase2end
-found memory-cached prog GB_jit_AxB_phase2end
- got kernel instance AxB_phase2end
-found memory-cached prog AxB_phase2end
-Launching _Z13AxB_phase2endPxPKxS1_S_S1_P16GB_Matrix_opaquex<<<1,32,0,0>>>(long*,long*,long*,long*,long*,GB_Matrix_opaque*,long)
-Printing bucketp
-0, 1, 2, 4294967299, 4294967300, 4294967302, 4294967303, 8589934600, 8589934601, 8589934603, 8589934605, 8589934606,
-Done.
-Printing bucket
-0, 16, 4294967296, 1, 1, 1, 0, 0, 4294967297, 1, 0, 4294967296, 0, 4294967296, 4294967296, 4294967297, 1, 4294967296, 4294967297, 4294967296, 4294967297, 4294967296, 0, 0, 4294967297, 4294967297, 4294967296, 1, 0, 1, 4294967296, 4294967296,
-Done.
-phase2 kernel done ==================
-[       OK ] AxB_dot3_tests_PLUS_TIMES_2.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (2 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_2.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t
-found device 0
-inside fill, using seed 12345
-fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
-[... 20 rmm_wrap_alloc lines elided, block sizes 1 KB through 8 MB ...]
-inside fill, using seed 54321
-fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
-[... 17 rmm_wrap_alloc lines elided, block sizes 16 KB through 8 MB ...]
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-inside fill, using seed 4567
-fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
- rmm_wrap_alloc 4096 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
- rmm_wrap_alloc 24576 bytes
- rmm_wrap_alloc 768 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
- rmm_wrap_alloc 256 bytes
-nthrd: 32, ntasks: 8
-Printing nanobuckets
-[... dump elided: all values 1 ...]
-Done.
-Printing blockbucket
-[... 96 values, all 1 ...]
-Done.
-GB_jit_AxB_phase2
-#include "GB_jit_AxB_phase2.cuh"
- jit_cache get program GB_jit_AxB_phase2
-found memory-cached prog GB_jit_AxB_phase2
- got kernel instance AxB_phase2
-found memory-cached prog AxB_phase2
-Launching _Z10AxB_phase2PxS_i<<<1,32,0,0>>>(long*,long*,int)
-s_0: 1, s_1=1, s_10=1, s_11=1
-GB_jit_AxB_phase2end
-#include "GB_jit_AxB_phase2end.cuh"
- jit_cache get program GB_jit_AxB_phase2end
-found memory-cached prog GB_jit_AxB_phase2end
- got kernel instance AxB_phase2end
-found memory-cached prog AxB_phase2end
-Launching _Z13AxB_phase2endPxPKxS1_S_S1_P16GB_Matrix_opaquex<<<8,32,0,0>>>(long*,long*,long*,long*,long*,GB_Matrix_opaque*,long)
-Printing bucketp
-0, 2, 4, 4294967302, 4294967303, 8589934600, 12884901897, 17179869194, 17179869195, 21474836492, 21474836493, 25769803791,
-Done.
-Printing bucket
-1, 16, 816, 848, 880, [... 1024-value dump elided: remaining values are 0, 1, 4294967296, or 4294967297 ...]
-Done.
-phase2 kernel done ==================
- rmm_wrap_alloc 1048576 bytes
- rmm_wrap_alloc 4194304 bytes
-[       OK ] AxB_dot3_tests_PLUS_TIMES_2.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (229 ms)
-[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_2 (475 ms total)
-
-[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_3
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_3.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t
-Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=32ncols=32 need 32 values, invsparse = 32
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-32 nonzeroes left to fill..
-2 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
-inside fill, using seed 4567
-fill_random nrows=32ncols=32 need 32 values, invsparse = 32
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-32 nonzeroes left to fill..
-2 nonzeroes left to fill..
- rmm_wrap_alloc 512 bytes
-inside fill, using seed 543210
-fill_random nrows=32ncols=32 need 1024 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
- rmm_wrap_alloc 512 bytes
-inside fill, using seed 32
-fill_random nrows=32ncols=32 need 1024 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
-32 slots to fill
-all pairs to bucket 1, no filling
-done assigning buckets
-Building semiring factgory
- calling stringify semiring: 0x7f1ff50e0200
-inside enumify: 0x7f1ff50e0200
-
- GraphBLAS Semiring: semiring (user-defined)
- GraphBLAS Monoid: semiring->add (built-in)
- GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
- identity: [ 0 ]
-
- GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
-Getting semiring add
-Getting semiring mult
-Getting semiring add op
-Getting types
-Getting opcodes
-Getting typecodes
-Performing asserts
-Invoking boolean rename
-Invoking boolean rename
-Invoking enumify binop
-e 14
-Invoking enumify monoid
-Calling enumify binop
-Inside plus binop code
-e 11
-Calling enumify identity
-Calling enumify terminal
-Done enumify monoid
-Done invoking enumify monoid
-atype
-btype
-ctype
-Invoking enumify_mask, mtype 0x7f2028b57180
-GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0
-got mask_ecode: 4
-constructing semiring scode
-before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0
-serialized_scode: 397409434374399488
-done enumify semiring
-scode=397409434374399488
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-32 slots to fill
-all pairs to bucket 1, no filling
-done assigning buckets
-bucket 1 has 32 dots to do
-LAUNCHING BUCKET CODE: 1
-INside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-successful_read: 1
-Just closed
- jit_cache get program GB_jit_AxB_dot3_phase3_dndn
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_dndn
- got kernel instance AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t
----------------------------------------
---- Linker for void AxB_dot3_phase3_dndn(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, int) ---
----------------------------------------
-info : 40 bytes gmem
-info : Function properties for '_Z20AxB_dot3_phase3_dndnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i':
-info : used 98 registers, 320 stack, 0 bytes smem, 412 bytes cmem[0], 0 bytes lmem
-
-
----------------------------------------
-Launching _Z20AxB_dot3_phase3_dndnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-[... 32 per-dot trace lines elided, "tid=0, i,j = 6,0 nnzA= 32, nnzB=32" through "tid=0, i,j = 31,31 nnzA= 32, nnzB=32" ...]
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 5.1968ms
-
- 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row
- sparsity control: sparse only
- C GPU, 32 entries, memory: 1.1 KB
-
- (0,6) 11
- (1,1) 10
- (3,12) 7
- (3,17) 6
- (4,19) 8
- (5,19) 10
- (6,22) 6
- (6,24) 9
- (8,10) 7
- (9,19) 8
- (9,31) 6
- (11,13) 8
- (12,11) 6
- (14,24) 10
- (15,30) 9
- (16,20) 5
- (17,30) 7
- (18,18) 12
- (19,1) 6
- (20,25) 7
- (21,24) 9
- (21,27) 6
- (22,30) 8
- (23,30) 11
- (24,14) 7
- (25,4) 9
- (26,15) 4
- (27,28) 5
- (28,16) 4
- ...
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 512 bytes
- rmm_wrap_alloc 512 bytes
-
- 32x32 GraphBLAS bool matrix, sparse by row
- sparsity control: sparse only
- M actual, 32 entries, memory: 1.0 KB
-
-[... 29 entries listed, all 1, at the same positions as C GPU above ...]
- ...
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- C GPU, 32 entries, memory: 1.1 KB
-
-[... same 32 entries as C GPU above, here printed in full, ending (29,9) 7, (30,24) 10, (31,31) 10 ...]
-
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- C_actual, 32 entries, memory: 1.5 KB
-
-[... same 32 entries as C GPU above ...]
-
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-
- 32x32 GraphBLAS double matrix, sparse by row
- Diff actual, 32 entries, memory: 1.2 KB
-
-[... 32 entries listed, all 0 ...]
-
-
- 32x32 GraphBLAS bool matrix, sparse by row
- T actual, 32 entries, memory: 1.0 KB
-
-[... 29 entries listed, all 1 ...]
- ...
- work:32 gpus:0
-Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=32ncols=32 need 32 values, invsparse = 32
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-32 nonzeroes left to fill..
-2 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 160 values, invsparse = 7 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -160 nonzeroes left to fill.. -62 nonzeroes left to fill.. - rmm_wrap_alloc 2048 bytes - rmm_wrap_alloc 1024 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 5, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1fea03f000 -inside enumify: 0x7f1fea03f000 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 5, no filling -done assigning buckets -bucket 5 has 32 dots to do -LAUNCHING BUCKET CODE: 5 -Confiring spdnINside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_spdn -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_spdn - got kernel instance AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t ---------------------------------------- ---- Linker for void AxB_dot3_phase3_spdn(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, 
GB_Matrix_opaque*, GB_Matrix_opaque*, int) --- ---------------------------------------- -info : 0 bytes gmem -info : Function properties for '_Z20AxB_dot3_phase3_spdnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i': -info : used 112 registers, 296 stack, 0 bytes smem, 412 bytes cmem[0], 0 bytes lmem - - ---------------------------------------- -Launching _Z20AxB_dot3_phase3_spdnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 2.00294ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 6 - - (0,6) zombie - (1,1) 2 - (3,12) 2 - (3,17) 2 - (4,19) zombie - (5,19) zombie - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,19) zombie - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) zombie - (19,1) 3 - (20,25) zombie - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... 
- - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 26 entries, memory: 1.1 KB - - (1,1) 2 - (3,12) 2 - (3,17) 2 - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (19,1) 3 - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - (29,9) 0 - (30,24) 1 - (31,31) 2 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 26 entries, memory: 1.1 KB - - (1,1) 2 - (3,12) 2 - (3,17) 2 - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (19,1) 3 - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - (29,9) 0 - (30,24) 1 - (31,31) 2 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 26 entries, memory: 1.2 KB - - (1,1) 0 - (3,12) 0 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 0 - (11,13) 0 - (12,11) 0 - (14,24) 0 - (15,30) 0 - (16,20) 0 - (17,30) 0 - (19,1) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 0 - (24,14) 0 - (25,4) 0 - (26,15) 0 - (27,28) 0 - (28,16) 0 - (29,9) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 26 entries, memory: 1.0 KB - - (1,1) 1 - (3,12) 1 - (3,17) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (19,1) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - (29,9) 1 - (30,24) 1 - (31,31) 1 - work:26 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 320 values, invsparse = 4 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -320 nonzeroes left to fill.. -140 nonzeroes left to fill.. 
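Each run ends with the same verification pattern seen in the dumps above: the GPU result (C GPU) is compared against a CPU reference (C_actual), "Diff actual" holds the elementwise difference (all zeros here), "T actual" is the boolean entries-match matrix (all true), and the summary line reports work:N. A hedged sketch of such a check using only documented GraphBLAS calls; the matrix names mirror the dump labels, but the test harness's real code may differ:

    #include "GraphBLAS.h"

    // true when C_gpu and C_actual have the same pattern and values
    bool compare_result_sketch (GrB_Matrix C_gpu, GrB_Matrix C_actual,
        GrB_Index n)
    {
        GrB_Matrix Diff ;
        GrB_Matrix_new (&Diff, GrB_FP64, n, n) ;
        // Diff = C_gpu - C_actual over the pattern union, so an entry present
        // in only one matrix survives into Diff
        GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_MINUS_FP64,
            C_gpu, C_actual, NULL) ;
        GrB_Matrix_apply (Diff, NULL, NULL, GrB_ABS_FP64, Diff, NULL) ;
        double err = 1 ;
        GrB_Matrix_reduce_FP64 (&err, NULL, GrB_MAX_MONOID_FP64, Diff, NULL) ;
        GrB_Index nv1, nv2 ;
        GrB_Matrix_nvals (&nv1, C_gpu) ;
        GrB_Matrix_nvals (&nv2, C_actual) ;
        GrB_Matrix_free (&Diff) ;
        return (nv1 == nv2 && err == 0) ;
    }

Note the nvals comparison: because several logged results legitimately contain the value 0, a max-of-|diff| test alone would not catch a pattern mismatch.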
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff52fd100 -inside enumify: 0x7f1ff52fd100 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -bucket 6 has 32 dots to do -LAUNCHING BUCKET CODE: 6 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vssp -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_vssp - got kernel instance AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t ---------------------------------------- ---- Linker for void AxB_dot3_phase3_vssp(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, int) --- ---------------------------------------- -info : 0 bytes gmem -info : Function properties for '_Z20AxB_dot3_phase3_vsspIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i': -info : used 215 registers, 296 stack, 0 bytes smem, 412 bytes cmem[0], 0 bytes lmem - - ---------------------------------------- -Launching _Z20AxB_dot3_phase3_vsspIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. 
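The enumify trace above compresses the whole problem (monoid, multiplier, types, mask, and the four sparsity formats) into the single 64-bit scode used as the JIT cache key. The bit positions below are inferred from this one logged example rather than lifted from the GraphBLAS source, so treat the field widths as assumptions, but packing the logged values does reproduce serialized_scode = 397409434374399488 exactly:

    #include <stdint.h>
    #include <stdio.h>

    #define LSHIFT(x,k) (((uint64_t) (x)) << (k))

    // field positions inferred from the logged example above
    static uint64_t pack_scode (void)
    {
        int add_ecode = 11, id_ecode = 0, term_ecode = 31, mult_ecode = 14 ;
        int flipxy = 0, zcode = 6, xcode = 6, ycode = 6, mask_ecode = 4 ;
        int ccode = 6, acode = 6, bcode = 6 ;
        int csparsity = 0, msparsity = 0, asparsity = 0, bsparsity = 0 ;
        return (LSHIFT (add_ecode , 55) | LSHIFT (id_ecode  , 50) |
                LSHIFT (term_ecode, 45) | LSHIFT (mult_ecode, 37) |
                LSHIFT (flipxy    , 36) | LSHIFT (zcode     , 32) |
                LSHIFT (xcode     , 28) | LSHIFT (ycode     , 24) |
                LSHIFT (mask_ecode, 20) | LSHIFT (ccode     , 16) |
                LSHIFT (acode     , 12) | LSHIFT (bcode     ,  8) |
                LSHIFT (csparsity ,  6) | LSHIFT (msparsity ,  4) |
                LSHIFT (asparsity ,  2) | LSHIFT (bsparsity ,  0)) ;
    }

    int main (void)
    {
        // prints 397409434374399488, matching every serialized_scode in this log
        printf ("scode = %llu\n", (unsigned long long) pack_scode ()) ;
        return (0) ;
    }

Every run in this log packs to the same scode because only the bucket assignment changes between runs; the semiring, types, mask, and sparsity codes are identical, which is why one cached semiring header serves all eight launches.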
-returned from kernel 2.62758ms - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 15 - - (0,6) zombie - (1,1) 1 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,19) zombie - (9,31) 1 - (11,13) zombie - (12,11) 0 - (14,24) 1 - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 1 - (20,25) zombie - (21,24) zombie - (21,27) 1 - (22,30) 0 - (23,30) 0 - (24,14) zombie - (25,4) zombie - (26,15) 0 - (27,28) zombie - (28,16) 0 - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 17 entries, memory: 1.1 KB - - (1,1) 1 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 1 - (12,11) 0 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 17 entries, memory: 1.1 KB - - (1,1) 1 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 1 - (12,11) 0 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 17 entries, memory: 1.2 KB - - (1,1) 0 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 0 - (12,11) 0 - (14,24) 0 - (15,30) 0 - (19,1) 0 - (21,27) 0 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 17 entries, memory: 1.0 KB - - (1,1) 1 - (3,17) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,31) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (26,15) 1 - (28,16) 1 - (30,24) 1 - (31,31) 1 - work:17 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. 
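The "zombie" entries in the C GPU dumps above are how a masked dot product deletes entries without compacting the matrix on the GPU: the entry stays in place but its row index is flipped to a negative value, and the later wait step prunes all flipped entries (hence C GPU reporting 32 entries with 15 zombies while C_actual has 17). The flip below is the usual negate-and-offset trick consistent with the GB_zombie.h header pulled in by the template further down; treat the exact macro as an assumption rather than the GraphBLAS definition:

    #include <stdint.h>
    #include <stdbool.h>

    #define FLIP(i)      (-(i) - 2)       // self-inverse: FLIP(FLIP(i)) == i
    #define IS_ZOMBIE(i) ((i) < 0)

    // count live entries in one sparse row, skipping zombies
    static int64_t row_live_count (const int64_t *Ci, int64_t pstart,
        int64_t pend)
    {
        int64_t live = 0 ;
        for (int64_t p = pstart ; p < pend ; p++)
        {
            if (!IS_ZOMBIE (Ci [p])) live++ ;
        }
        return (live) ;
    }

The offset of 2 (rather than plain negation) keeps index 0 flippable, since -0 would be indistinguishable from a live entry.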
- rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. - rmm_wrap_alloc 1024 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff50e0300 -inside enumify: 0x7f1ff50e0300 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -bucket 7 has 32 dots to do -LAUNCHING BUCKET CODE: 7 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t ---------------------------------------- ---- Linker for void AxB_dot3_phase3_vsvs(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, int) --- ---------------------------------------- -info : 0 bytes gmem -info : Function properties for '_Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i': -info : used 88 registers, 336 stack, 128 bytes smem, 412 bytes cmem[0], 0 bytes lmem - - ---------------------------------------- -Launching 
_Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 2.37363ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. 
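Phases 1 and 2 assign every surviving dot product C(i,j) = A(:,i)'*B(:,j) to a bucket, and phase 3 launches one specialized kernel per non-empty bucket; this log exercises dndn (bucket 1, both vectors dense), spdn (5, sparse times dense), vssp (6, very lopsided sparse pair), vsvs (buckets 7 through 10, both vectors very sparse), and mp (11, general merge path). A size-based classifier in that spirit is sketched below; the thresholds are illustrative assumptions, not the actual bucket logic:

    #include <stdint.h>

    typedef enum { BUCKET_DNDN = 1, BUCKET_SPDN = 5, BUCKET_VSSP = 6,
                   BUCKET_VSVS = 7, BUCKET_MP = 11 } bucket_code ;

    // classify one dot product by the entry counts of its operand vectors;
    // the real code further subdivides the vsvs case into buckets 7..10
    static bucket_code choose_bucket (int64_t ainz, int64_t bjnz, int64_t vlen)
    {
        if (ainz == vlen && bjnz == vlen) return (BUCKET_DNDN) ; // both full
        if (ainz == vlen || bjnz == vlen) return (BUCKET_SPDN) ; // one full
        int64_t small = (ainz < bjnz) ? ainz : bjnz ;
        int64_t large = (ainz < bjnz) ? bjnz : ainz ;
        if (64 * small < large) return (BUCKET_VSSP) ; // lopsided: binary search
        if (large <= 256)       return (BUCKET_VSVS) ; // both tiny
        return (BUCKET_MP) ;                           // general merge path
    }

Since each test problem here is built so that all 32 pairs land in one bucket ("all pairs to bucket N, no filling"), every launch exercises exactly one kernel variant.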
- rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff52ff000 -inside enumify: 0x7f1ff52ff000 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -bucket 8 has 32 dots to do -LAUNCHING BUCKET CODE: 8 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. 
-returned from kernel 0.387072ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. 
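Buckets 8 through 10 reuse the vsvs kernel, and from the second use onward the trace reads "found memory-cached prog" instead of re-reading the file cache, which is why the kernel turnaround drops from 2.37 ms for the first vsvs launch to under 0.5 ms here. A minimal sketch of such a process-lifetime memoization layer in front of the file cache from the earlier sketch; the fixed-size table and names are hypothetical:

    #include <string.h>
    #include <stdint.h>

    #define CACHE_SLOTS 64

    typedef struct { char name [256] ; char *prog ; } cache_entry ;
    static cache_entry table [CACHE_SLOTS] ;
    static int used = 0 ;

    // the file-cache sketch shown earlier in this log
    extern char *jit_cache_get (const char *dir, uint64_t scode,
        const char *kname) ;

    // return the in-memory copy when present; hit disk (or compile) only once
    char *jit_cache_get_memoized (const char *dir, uint64_t scode,
        const char *kname)
    {
        for (int k = 0 ; k < used ; k++)
        {
            if (strcmp (table [k].name, kname) == 0)
            {
                return (table [k].prog) ;   // "found memory-cached prog ..."
            }
        }
        char *prog = jit_cache_get (dir, scode, kname) ;
        if (used < CACHE_SLOTS)
        {
            strncpy (table [used].name, kname, sizeof (table [used].name) - 1) ;
            table [used].prog = prog ;
            used++ ;
        }
        return (prog) ;
    }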
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5300900 -inside enumify: 0x7f1ff5300900 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -bucket 9 has 32 dots to do -LAUNCHING BUCKET CODE: 9 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.468992ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. 
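The rmm_wrap_alloc lines threaded through this log only ever report power-of-two sizes (256, 512, 1024, 2048 bytes), consistent with a pool allocator that rounds requests up to a minimum block size. The rounding below is an assumption inferred from those traces, not the rmm_wrap implementation:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    static size_t round_up_pow2 (size_t n, size_t minimum)
    {
        size_t b = minimum ;
        while (b < n) b <<= 1 ;     // e.g. 100 -> 256, 300 -> 512
        return (b) ;
    }

    void *rmm_wrap_alloc_sketch (size_t n)
    {
        size_t b = round_up_pow2 (n, 256) ;
        printf (" rmm_wrap_alloc %zu bytes\n", b) ;
        return (malloc (b)) ;       // stand-in for a pooled/managed allocation
    }

Rounding this way trades some internal fragmentation for cheap reuse: a freed 512-byte block can satisfy any later request up to 512 bytes without touching the underlying CUDA allocator.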
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 10, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5301f00 -inside enumify: 0x7f1ff5301f00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 10, no filling -done assigning buckets -bucket 10 has 32 dots to do -LAUNCHING BUCKET CODE: 10 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.418816ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 160 values, invsparse = 7 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -160 nonzeroes left to fill.. -51 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -20 nonzeroes left to fill.. 
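Bucket 11 switches to the general merge-path kernel, and it is the first cache miss in this log: the .cuh template is resolved by jitify, compiled, and its PTX dumped below. The _Z13GB_reduce_sumIiLi32E... symbol that PTX declares is a tile-wide sum over a cooperative_groups::thread_block_tile<32>, visible in the generated __shfl_down_sync calls. The standard shuffle-down idiom looks like this (a sketch of the idea, not the GraphBLAS template itself):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups ;

    // sum val across a 32-lane tile; lane 0 holds the total afterwards
    __device__ int GB_reduce_sum_sketch (cg::thread_block_tile<32> g, int val)
    {
        // each step folds the upper half of the live lanes into the lower half
        for (int offset = g.size () / 2 ; offset > 0 ; offset >>= 1)
        {
            val += g.shfl_down (val, offset) ;
        }
        return (val) ;
    }

One other note on reading the dump: ".target sm_70, debug" and the .loc directives show this PTX was generated with device debug info, which explains the high register and stack counts reported by the linker lines above.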
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 11, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5306300 -inside enumify: 0x7f1ff5306300 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 11, no filling -done assigning buckets -bucket 11 has 32 dots to do -LAUNCHING BUCKET CODE: 11 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_mp - failed to open cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_mp -compiling now ---------------------------------------- ---- Source of GB_jit_AxB_dot3_phase3_mp --- ---------------------------------------- - 1 #include "/home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h" - 2 #include "GB_jit_AxB_dot3_phase3_mp.cuh" ---------------------------------------- -Found #include GB_jit_AxB_dot3_phase3_mp.cuh from GB_jit_AxB_dot3_phase3_mp:2 [] at: - ../templates/GB_jit_AxB_dot3_phase3_mp.cuh -Found #include limits from GB_jit_AxB_dot3_phase3_mp.cuh:36 [../templates/GB_jit_AxB_dot3_phase3_mp.cuh] at: - __jitify_builtin/limits -Found #include climits from limits:4 [__jitify_builtin/limits] at: - __jitify_builtin/climits -Found #include cfloat from limits:5 [__jitify_builtin/limits] at: - __jitify_builtin/cfloat -Found #include cstdint from GB_jit_AxB_dot3_phase3_mp.cuh:37 [../templates/GB_jit_AxB_dot3_phase3_mp.cuh] at: - __jitify_builtin/cstdint -Found #include cooperative_groups.h from GB_jit_AxB_dot3_phase3_mp.cuh:38 [../templates/GB_jit_AxB_dot3_phase3_mp.cuh] at: - /usr/local/cuda/include/cooperative_groups.h -Found #include 
cooperative_groups/details/info.h from cooperative_groups.h:55 [/usr/local/cuda/include/cooperative_groups.h] at: - /usr/local/cuda/include/cooperative_groups/details/info.h -Found #include cooperative_groups/details/driver_abi.h from cooperative_groups.h:56 [/usr/local/cuda/include/cooperative_groups.h] at: - /usr/local/cuda/include/cooperative_groups/details/driver_abi.h -Found #include cooperative_groups/details/helpers.h from cooperative_groups.h:57 [/usr/local/cuda/include/cooperative_groups.h] at: - /usr/local/cuda/include/cooperative_groups/details/helpers.h -Found #include sync.h from cooperative_groups/details/helpers.h:53 [/usr/local/cuda/include/cooperative_groups/details/helpers.h] at: - /usr/local/cuda/include/cooperative_groups/details/sync.h -Found #include info.h from sync.h:52 [/usr/local/cuda/include/cooperative_groups/details/sync.h] at: - /usr/local/cuda/include/cooperative_groups/details/info.h -Found #include cooperative_groups/details/partitioning.h from cooperative_groups.h:1810 [/usr/local/cuda/include/cooperative_groups.h] at: - /usr/local/cuda/include/cooperative_groups/details/partitioning.h -Found #include matrix.h from GB_jit_AxB_dot3_phase3_mp.cuh:39 [../templates/GB_jit_AxB_dot3_phase3_mp.cuh] at: - ../matrix.h -matrix.h(52): warning: stdbool.h: [jitify] File not found -Found #include stddef.h from matrix.h:53 [../matrix.h] at: - __jitify_builtin/stddef.h -Found #include GB_opaque.h from matrix.h:131 [../matrix.h] at: - ../../Source/GB_opaque.h -Found #include GB_Operator.h from GB_opaque.h:397 [../../Source/GB_opaque.h] at: - ../../Source/Template/GB_Operator.h -Found #include GB_matrix.h from GB_opaque.h:495 [../../Source/GB_opaque.h] at: - ../../Source/Template/GB_matrix.h -Found #include GB_imin.h from matrix.h:135 [../matrix.h] at: - ../../Source/GB_imin.h -Found #include GB_zombie.h from matrix.h:136 [../matrix.h] at: - ../../Source/GB_zombie.h -Found #include GB_nnz.h from matrix.h:137 [../matrix.h] at: - ../../Source/GB_nnz.h -Found #include GB_partition.h from matrix.h:138 [../matrix.h] at: - ../../Source/GB_partition.h -Found #include GB_binary_search.h from matrix.h:139 [../matrix.h] at: - ../../Source/GB_binary_search.h -Found #include GB_lookup_template.c from GB_binary_search.h:230 [../../Source/GB_binary_search.h] at: - ../../Source/Template/GB_lookup_template.c -Found #include GB_search_for_vector_template.c from matrix.h:140 [../matrix.h] at: - ../../Source/Template/GB_search_for_vector_template.c -completed func() - compiled serialized prog GB_jit_AxB_dot3_phase3_mp -writing prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_AxB_dot3_phase3_mp - got kernel instance AxB_dot3_phase3_mp_int32_t_int32_t_int32_t - failed to open cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_mp_int32_t_int32_t_int32_t -compiling now -About to instantiate kernel -ABout to compile kernel -done compilling ---------------------------------------- -_Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i ---------------------------------------- ---- PTX for 0x7ffcc2488c20 in GB_jit_AxB_dot3_phase3_mp --- ---------------------------------------- -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-30794723 -// Cuda compilation tools, release 11.6, V11.6.55 -// Based on NVVM 7.0.1 -// - -.version 7.6 -.target sm_70, debug -.address_size 64 - - // .weak _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i -.weak .func (.param .b32 func_retval0) 
_Z13GB_reduce_sumIiLi32EET_N18cooperative_groups4__v117thread_block_tileIXT0_EvEES0_ -( - .param .align 8 .b8 _Z13GB_reduce_sumIiLi32EET_N18cooperative_groups4__v117thread_block_tileIXT0_EvEES0__param_0[16], - .param .b32 _Z13GB_reduce_sumIiLi32EET_N18cooperative_groups4__v117thread_block_tileIXT0_EvEES0__param_1 -) -; -.extern .func (.param .b32 func_retval0) vprintf -( - .param .b64 vprintf_param_0, - .param .b64 vprintf_param_1 -) -; -.weak .func _ZN18cooperative_groups4__v117thread_group_baseILj4EEC2Ev -( - .param .b64 _ZN18cooperative_groups4__v117thread_group_baseILj4EEC2Ev_param_0 -) -; -.weak .func _ZN18cooperative_groups4__v17details22thread_block_tile_implILj32EvLb0EEC2ILj32ENS0_12thread_blockELb0EEERKNS2_IXT_ET0_XT1_EEE -( - .param .b64 _ZN18cooperative_groups4__v17details22thread_block_tile_implILj32EvLb0EEC2ILj32ENS0_12thread_blockELb0EEERKNS2_IXT_ET0_XT1_EEE_param_0, - .param .b64 _ZN18cooperative_groups4__v17details22thread_block_tile_implILj32EvLb0EEC2ILj32ENS0_12thread_blockELb0EEERKNS2_IXT_ET0_XT1_EEE_param_1 -) -; -.weak .func _ZN18cooperative_groups4__v117thread_block_tileILj32ENS0_12thread_blockEEC2ERKS2_ -( - .param .b64 _ZN18cooperative_groups4__v117thread_block_tileILj32ENS0_12thread_blockEEC2ERKS2__param_0, - .param .b64 _ZN18cooperative_groups4__v117thread_block_tileILj32ENS0_12thread_blockEEC2ERKS2__param_1 -) -; -.func (.param .b64 func_retval0) _Z9atomicAddPyy -( - .param .b64 _Z9atomicAddPyy_param_0, - .param .b64 _Z9atomicAddPyy_param_1 -) -; -.func (.param .b32 func_retval0) _Z13__ballot_syncji -( - .param .b32 _Z13__ballot_syncji_param_0, - .param .b32 _Z13__ballot_syncji_param_1 -) -; -.func (.param .b32 func_retval0) _Z16__shfl_down_syncjiji -( - .param .b32 _Z16__shfl_down_syncjiji_param_0, - .param .b32 _Z16__shfl_down_syncjiji_param_1, - .param .b32 _Z16__shfl_down_syncjiji_param_2, - .param .b32 _Z16__shfl_down_syncjiji_param_3 -) -; -.func (.param .b64 func_retval0) __ullAtomicAdd -( - .param .b64 __ullAtomicAdd_param_0, - .param .b64 __ullAtomicAdd_param_1 -) -; -.global .align 1 .b8 $str[42] = {119, 97, 114, 112, 32, 37, 100, 32, 122, 111, 109, 98, 105, 101, 32, 99, 111, 117, 110, 116, 32, 61, 32, 37, 100, 44, 32, 110, 122, 111, 109, 98, 105, 101, 115, 32, 61, 32, 37, 100, 10, 0}; -.global .align 1 .b8 $str$1[17] = {32, 67, 122, 111, 109, 98, 105, 101, 32, 61, 32, 37, 108, 108, 100, 10, 0}; - -.weak .entry _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i( - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_0, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_1, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_2, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_3, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_4, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_5, - .param .u64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_6, - .param .u32 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_7 -) -{ - .local .align 8 .b8 __local_depot0[160]; - .reg .b64 %SP; - .reg .b64 %SPL; - .reg .pred %p<81>; - .reg .b16 %rs<2>; - .reg .b32 %r<141>; - .reg .b64 %rd<224>; - .loc 10 76 0 -$L__func_begin0: - .loc 10 76 0 - - - mov.u64 %SPL, __local_depot0; - cvta.local.u64 %SP, %SPL; - ld.param.u64 %rd31, [_Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_0]; - ld.param.u64 %rd32, 
[Deleted generated PTX, elided for readability: the body of the templated kernel _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i ("AxB_dot3_phase3_mp<int,int,int>"). The removed lines load the kernel parameters (the C, M, A, and B GB_Matrix_opaque pointers and their p/i/x arrays), then, for each block-assigned entry of C, run two merge-path binary searches over the Ai and Bi index arrays ($L__BB0_18–$L__BB0_23 and $L__BB0_36–$L__BB0_41), accumulate the dot product over matching indices ($L__BB0_46–$L__BB0_59), combine per-thread partials with a __ballot_sync vote and a GB_reduce_sum<int,32> warp reduction, and either store the result into Cx/Ci or store the flipped index -i-2 (apparently the zombie encoding) and tally it in a printf/atomicAdd epilogue ($L__BB0_69–$L__BB0_73). The hunk also deletes weak definitions of the helpers the kernel calls — GB_reduce_sum, the dim3 and cooperative_groups thread_group / thread_block_tile constructors, and thin wrappers over atom.add.u64 (__ullAtomicAdd), vote.sync.ballot (__ballot_sync), and shfl.sync.down (__shfl_down_sync) — followed by the .file directives naming the headers under /share/workspace/nvidia_projects/GraphBLAS/CUDA/test/ (GB_opaque.h, matrix.h, the cooperative_groups headers, cstdint, __nv_nvrtc_builtin_header.h, GB_matrix.h, GB_jit_AxB_dot3_phase3_mp.cuh) and the opening entries of the DWARF .debug_loc section.]
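Because the elided PTX is essentially unreadable, here is a minimal CUDA sketch, reconstructed from the PTX control flow and its .loc references into GB_jit_AxB_dot3_phase3_mp.cuh, of the two routines at its core: the merge-path diagonal search and the GB_reduce_sum warp reduction. Every name, signature, and the standalone diagonal_search function below are illustrative assumptions, not the shipped GraphBLAS sources.

// Sketch only: plausible CUDA source for the routines whose generated PTX is
// elided above; reconstructed from the PTX control flow, so names and
// signatures here are assumptions, not the actual GraphBLAS code.
#include <cstdint>
#include <cooperative_groups.h>
using namespace cooperative_groups ;

// Warp-level sum reduction. Matches callseq 8 in the PTX: __shfl_down_sync
// with width 32 and offsets 16, 8, 4, 2, 1, accumulating into one lane.
template <typename T, int warp_sz>
__device__ T GB_reduce_sum (thread_block_tile<warp_sz> g, T val)
{
    // each pass halves the number of lanes still holding a partial sum
    for (int i = g.size ( ) / 2 ; i > 0 ; i /= 2)
    {
        val += g.shfl_down (val, i) ;
    }
    return (val) ;   // lane 0 now holds the tile-wide total
}

// Merge-path diagonal search. Matches the binary-search loops at
// $L__BB0_18..$L__BB0_23 and $L__BB0_36..$L__BB0_41: locate where the k-th
// cross-diagonal of the (Ai, Bi) merge grid crosses the merge path.
__device__ int diagonal_search (const int64_t *Ai, const int64_t *Bi,
                                int64_t xstart, int64_t ystart,
                                int k, int x_min, int x_max)
{
    while (x_min < x_max)
    {
        int pivot = (x_min + x_max) / 2 ;
        if (Ai [pivot + xstart] < Bi [k - pivot - 1 + ystart])
        {
            x_min = pivot + 1 ;   // crossing lies to the right of pivot
        }
        else
        {
            x_max = pivot ;       // crossing lies at or left of pivot
        }
    }
    return (x_min) ;   // first position where Ai >= Bi along diagonal k
}

In the deleted kernel the flow appears to be: each 32-thread tile first runs the diagonal search twice to split the (Ai, Bi) intersection work evenly across its threads, then combines the per-thread partial dot products with GB_reduce_sum before writing the C entry.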
[Deleted generated PTX, elided: the remainder of the DWARF metadata for the same listing — the rest of the .debug_loc location lists, the .debug_abbrev abbreviation table, and the start of the .debug_info section (compile-unit producer "lgenfe: EDG 6.2", the GB_jit_AxB_dot3_phase3_mp unit name, and mangled records for GxB_FLIPDIAGINDEX_INT32/INT64 and the cooperative_groups static members), all emitted as raw .b8/.b32/.b64 byte streams.]
95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,57 -.b8 116,105,108,101,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 157 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj2ELj65535ELj15ELj4EE9tileCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,57 -.b8 116,105,108,101,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,56 -.b8 116,105,108,101,77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 158 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj2ELj65535ELj15ELj4EE8tileMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,56 -.b8 116,105,108,101,77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,56 -.b8 108,97,110,101,77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 159 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj2ELj65535ELj15ELj4EE8laneMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,56 -.b8 108,97,110,101,77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,49 -.b8 48,115,104,105,102,116,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 160 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj2ELj65535ELj15ELj4EE10shiftCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,50,69,76,106,54,53,53,51,53,69,76,106,49,53,69,76,106,52,69,69,49 -.b8 48,115,104,105,102,116,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 
101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,57,116,105,108 -.b8 101,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 157 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj4ELj255ELj7ELj3EE9tileCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,57,116,105,108 -.b8 101,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,56,116,105,108 -.b8 101,77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 158 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj4ELj255ELj7ELj3EE8tileMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,56,116,105,108 -.b8 101,77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,56,108,97,110 -.b8 101,77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 159 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj4ELj255ELj7ELj3EE8laneMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,56,108,97,110 -.b8 101,77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,49,48,115,104 -.b8 105,102,116,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 160 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj4ELj255ELj7ELj3EE10shiftCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,52,69,76,106,50,53,53,69,76,106,55,69,76,106,51,69,69,49,48,115,104 -.b8 105,102,116,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 157 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj8ELj15ELj3ELj2EE9tileCountE 
-.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 158 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj8ELj15ELj3ELj2EE8tileMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 159 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj8ELj15ELj3ELj2EE8laneMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 160 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj8ELj15ELj3ELj2EE10shiftCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,56,69,76,106,49,53,69,76,106,51,69,76,106,50,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 157 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj16ELj3ELj1ELj1EE9tileCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 
95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 158 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj16ELj3ELj1ELj1EE8tileMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 159 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj16ELj3ELj1ELj1EE8laneMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 160 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj16ELj3ELj1ELj1EE10shiftCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,49,54,69,76,106,51,69,76,106,49,69,76,106,49,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 157 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj32ELj1ELj0ELj0EE9tileCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,57,116,105,108,101 -.b8 67,111,117,110,116,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 
158 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj32ELj1ELj0ELj0EE8tileMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,56,116,105,108,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b32 450 -.b8 5 -.b8 159 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj32ELj1ELj0ELj0EE8laneMaskE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,56,108,97,110,101 -.b8 77,97,115,107,69 -.b8 0 -.b8 6 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b32 450 -.b8 5 -.b8 160 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN18cooperative_groups4__v17details4tile13_tile_helpersILj32ELj1ELj0ELj0EE10shiftCountE -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108 -.b8 101,49,51,95,116,105,108,101,95,104,101,108,112,101,114,115,73,76,106,51,50,69,76,106,49,69,76,106,48,69,76,106,48,69,69,49,48,115,104,105 -.b8 102,116,67,111,117,110,116,69 -.b8 0 -.b8 2 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108,101,50,48,95,77,101,109,111,114,121,83,104,117,102 -.b8 102,108,101,67,117,116,111,102,102,69 -.b8 0 -.b32 7940 -.b8 5 -.b8 98 -.b8 1 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details4tile20_MemoryShuffleCutoffE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,52,116,105,108,101,50,48,95,77,101,109,111,114,121,83,104,117,102 -.b8 102,108,101,67,117,116,111,102,102,69 -.b8 0 -.b8 7 -.b32 7945 -.b8 8 -.b8 117,110,115,105,103,110,101,100,32,108,111,110,103,32,108,111,110,103 -.b8 0 -.b8 7 -.b8 8 -.b8 6 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 
116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,56,99,111,97,108,101,115,99,101,100,95,103,114,111,117,112,95 -.b8 105,100,69 -.b8 0 -.b32 450 -.b8 3 -.b8 69 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details18coalesced_group_idE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,56,99,111,97,108,101,115,99,101,100,95,103,114,111,117,112,95 -.b8 105,100,69 -.b8 0 -.b8 6 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,57,109,117,108,116,105,95,103,114,105,100,95,103,114,111,117,112 -.b8 95,105,100,69 -.b8 0 -.b32 450 -.b8 3 -.b8 70 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details19multi_grid_group_idE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,57,109,117,108,116,105,95,103,114,105,100,95,103,114,111,117,112 -.b8 95,105,100,69 -.b8 0 -.b8 6 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,51,103,114,105,100,95,103,114,111,117,112,95,105,100,69 -.b8 0 -.b32 450 -.b8 3 -.b8 71 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details13grid_group_idE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,51,103,114,105,100,95,103,114,111,117,112,95,105,100,69 -.b8 0 -.b8 6 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,53,116,104,114,101,97,100,95,98,108,111,99,107,95,105,100,69 -.b8 0 -.b32 450 -.b8 3 -.b8 72 -.b8 5 -.b8 9 -.b8 3 -.b64 
_ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details15thread_block_idE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,53,116,104,114,101,97,100,95,98,108,111,99,107,95,105,100,69 -.b8 0 -.b8 6 -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,57,109,117,108,116,105,95,116,105,108,101,95,103,114,111,117,112 -.b8 95,105,100,69 -.b8 0 -.b32 450 -.b8 3 -.b8 73 -.b8 5 -.b8 9 -.b8 3 -.b64 _ZN64_INTERNAL_00000000_25_GB_jit_AxB_dot3_phase3_mp_f71137a9_402139318cooperative_groups4__v17details19multi_tile_group_idE -.b8 95,90,78,54,52,95,73,78,84,69,82,78,65,76,95,48,48,48,48,48,48,48,48,95,50,53,95,71,66,95,106,105,116,95,65,120,66,95,100,111 -.b8 116,51,95,112,104,97,115,101,51,95,109,112,95,102,55,49,49,51,55,97,57,95,52,48,50,49,51,57,51,49,56,99,111,111,112,101,114,97,116,105 -.b8 118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,49,57,109,117,108,116,105,95,116,105,108,101,95,103,114,111,117,112 -.b8 95,105,100,69 -.b8 0 -.b8 3 -.b32 9325 -.b8 95,90,78,49,56,95,95,106,105,116,105,102,121,95,115,116,100,105,110,116,95,110,115,55,105,110,116,54,52,95,116,69 -.b8 0 -.b8 6 -.b8 8 -.b8 8 -.b8 108,111,110,103,32,108,111,110,103 -.b8 0 -.b8 5 -.b8 8 -.b8 3 -.b32 9352 -.b8 115,105,122,101,95,116 -.b8 0 -.b8 7 -.b8 55 -.b8 8 -.b8 117,110,115,105,103,110,101,100,32,108,111,110,103 -.b8 0 -.b8 7 -.b8 8 -.b8 9 -.b8 71,66,95,77,97,116,114,105,120,95,111,112,97,113,117,101 -.b8 0 -.b8 208 -.b8 1 -.b8 247 -.b8 1 -.b8 10 -.b8 109,97,103,105,99 -.b8 0 -.b32 9285 -.b8 8 -.b8 33 -.b8 2 -.b8 35 -.b8 0 -.b8 10 -.b8 104,101,97,100,101,114,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 34 -.b8 2 -.b8 35 -.b8 8 -.b8 10 -.b8 108,111,103,103,101,114 -.b8 0 -.b32 10089 -.b8 8 -.b8 35 -.b8 2 -.b8 35 -.b8 16 -.b8 10 -.b8 108,111,103,103,101,114,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 36 -.b8 2 -.b8 35 -.b8 24 -.b8 10 -.b8 116,121,112,101 -.b8 0 -.b32 10106 -.b8 8 -.b8 40 -.b8 2 -.b8 35 -.b8 32 -.b8 5 -.b8 71,66,95,84,121,112,101,95,111,112,97,113,117,101 -.b8 0 -.b8 0 -.b8 1 -.b8 129 -.b8 1 -.b8 10 -.b8 112,108,101,110 -.b8 0 -.b32 9285 -.b8 8 -.b8 205 -.b8 2 -.b8 35 -.b8 40 -.b8 10 -.b8 118,108,101,110 -.b8 0 -.b32 9285 -.b8 8 -.b8 206 -.b8 2 -.b8 35 -.b8 48 -.b8 10 -.b8 118,100,105,109 -.b8 0 -.b32 9285 -.b8 8 -.b8 207 -.b8 2 -.b8 35 -.b8 56 -.b8 10 -.b8 110,118,101,99 -.b8 0 -.b32 9285 -.b8 8 -.b8 208 -.b8 2 -.b8 35 -.b8 64 -.b8 10 -.b8 110,118,101,99,95,110,111,110,101,109,112,116,121 -.b8 0 -.b32 9285 -.b8 8 -.b8 211 -.b8 2 -.b8 35 -.b8 72 -.b8 10 -.b8 104 -.b8 0 -.b32 10131 -.b8 8 -.b8 214 -.b8 2 -.b8 35 -.b8 80 -.b8 10 -.b8 112 -.b8 0 -.b32 10131 -.b8 8 -.b8 215 -.b8 2 -.b8 35 -.b8 88 -.b8 10 -.b8 105 -.b8 0 -.b32 10131 -.b8 8 -.b8 216 -.b8 2 -.b8 35 -.b8 96 -.b8 10 -.b8 120 -.b8 0 -.b32 10140 -.b8 8 -.b8 217 -.b8 2 -.b8 35 -.b8 104 -.b8 10 -.b8 98 -.b8 0 -.b32 10155 -.b8 8 -.b8 219 -.b8 2 -.b8 35 -.b8 112 -.b8 10 
-.b8 110,118,97,108,115 -.b8 0 -.b32 9285 -.b8 8 -.b8 220 -.b8 2 -.b8 35 -.b8 120 -.b8 10 -.b8 112,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 222 -.b8 3 -.b8 35 -.b8 128,1 -.b8 10 -.b8 104,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 223 -.b8 3 -.b8 35 -.b8 136,1 -.b8 10 -.b8 98,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 224 -.b8 3 -.b8 35 -.b8 144,1 -.b8 10 -.b8 105,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 225 -.b8 3 -.b8 35 -.b8 152,1 -.b8 10 -.b8 120,95,115,105,122,101 -.b8 0 -.b32 9338 -.b8 8 -.b8 226 -.b8 3 -.b8 35 -.b8 160,1 -.b8 11 -.b8 80,101,110,100,105,110,103 -.b8 0 -.b32 10218 -.b8 8 -.b8 21 -.b8 1 -.b8 3 -.b8 35 -.b8 168,1 -.b8 5 -.b8 71,66,95,80,101,110,100,105,110,103,95,115,116,114,117,99,116 -.b8 0 -.b8 0 -.b8 1 -.b8 214 -.b8 1 -.b8 11 -.b8 110,122,111,109,98,105,101,115 -.b8 0 -.b32 10246 -.b8 8 -.b8 51 -.b8 1 -.b8 3 -.b8 35 -.b8 176,1 -.b8 11 -.b8 104,121,112,101,114,95,115,119,105,116,99,104 -.b8 0 -.b32 10287 -.b8 8 -.b8 116 -.b8 1 -.b8 3 -.b8 35 -.b8 184,1 -.b8 11 -.b8 98,105,116,109,97,112,95,115,119,105,116,99,104 -.b8 0 -.b32 10287 -.b8 8 -.b8 117 -.b8 1 -.b8 3 -.b8 35 -.b8 188,1 -.b8 11 -.b8 115,112,97,114,115,105,116,121,95,99,111,110,116,114,111,108 -.b8 0 -.b32 10296 -.b8 8 -.b8 118 -.b8 1 -.b8 3 -.b8 35 -.b8 192,1 -.b8 11 -.b8 112,95,115,104,97,108,108,111,119 -.b8 0 -.b32 2753 -.b8 8 -.b8 135 -.b8 1 -.b8 3 -.b8 35 -.b8 196,1 -.b8 11 -.b8 104,95,115,104,97,108,108,111,119 -.b8 0 -.b32 2753 -.b8 8 -.b8 136 -.b8 1 -.b8 3 -.b8 35 -.b8 197,1 -.b8 11 -.b8 98,95,115,104,97,108,108,111,119 -.b8 0 -.b32 2753 -.b8 8 -.b8 137 -.b8 1 -.b8 3 -.b8 35 -.b8 198,1 -.b8 11 -.b8 105,95,115,104,97,108,108,111,119 -.b8 0 -.b32 2753 -.b8 8 -.b8 138 -.b8 1 -.b8 3 -.b8 35 -.b8 199,1 -.b8 11 -.b8 120,95,115,104,97,108,108,111,119 -.b8 0 -.b32 2753 -.b8 8 -.b8 139 -.b8 1 -.b8 3 -.b8 35 -.b8 200,1 -.b8 11 -.b8 115,116,97,116,105,99,95,104,101,97,100,101,114 -.b8 0 -.b32 2753 -.b8 8 -.b8 140 -.b8 1 -.b8 3 -.b8 35 -.b8 201,1 -.b8 11 -.b8 105,115,95,99,115,99 -.b8 0 -.b32 2753 -.b8 8 -.b8 146 -.b8 1 -.b8 3 -.b8 35 -.b8 202,1 -.b8 11 -.b8 106,117,109,98,108,101,100 -.b8 0 -.b32 2753 -.b8 8 -.b8 147 -.b8 1 -.b8 3 -.b8 35 -.b8 203,1 -.b8 11 -.b8 105,115,111 -.b8 0 -.b32 2753 -.b8 8 -.b8 172 -.b8 1 -.b8 3 -.b8 35 -.b8 204,1 -.b8 0 -.b8 4 -.b32 10098 -.b32 12 -.b8 8 -.b8 99,104,97,114 -.b8 0 -.b8 6 -.b8 1 -.b8 3 -.b32 10122 -.b8 71,114,66,95,84,121,112,101 -.b8 0 -.b8 2 -.b8 67 -.b8 4 -.b32 9483 -.b32 12 -.b8 4 -.b32 9285 -.b32 12 -.b8 4 -.b32 10149 -.b32 12 -.b8 12 -.b8 118,111,105,100 -.b8 0 -.b8 4 -.b32 10164 -.b32 12 -.b8 3 -.b32 10203 -.b8 95,90,78,49,56,95,95,106,105,116,105,102,121,95,115,116,100,105,110,116,95,110,115,54,105,110,116,56,95,116,69 -.b8 0 -.b8 6 -.b8 5 -.b8 8 -.b8 115,105,103,110,101,100,32,99,104,97,114 -.b8 0 -.b8 6 -.b8 1 -.b8 13 -.b32 10237 -.b8 71,66,95,80,101,110,100,105,110,103 -.b8 0 -.b8 1 -.b8 231 -.b8 1 -.b8 4 -.b32 9773 -.b32 12 -.b8 3 -.b32 7945 -.b8 95,90,78,49,56,95,95,106,105,116,105,102,121,95,115,116,100,105,110,116,95,110,115,56,117,105,110,116,54,52,95,116,69 -.b8 0 -.b8 6 -.b8 22 -.b8 8 -.b8 102,108,111,97,116 -.b8 0 -.b8 4 -.b8 4 -.b8 8 -.b8 105,110,116 -.b8 0 -.b8 5 -.b8 4 -.b8 3 -.b32 10321 -.b8 71,114,66,95,77,97,116,114,105,120 -.b8 0 -.b8 2 -.b8 76 -.b8 4 -.b32 9369 -.b32 12 -.b8 3 -.b32 10296 -.b8 95,90,78,49,56,95,95,106,105,116,105,102,121,95,115,116,100,105,110,116,95,110,115,55,105,110,116,51,50,95,116,69 -.b8 0 -.b8 6 -.b8 7 -.b8 4 -.b32 14534 -.b32 12 -.b8 9 -.b8 100,105,109,51 -.b8 0 -.b8 12 -.b8 7 -.b8 205 -.b8 3 
-.b8 11 -.b8 120 -.b8 0 -.b32 455 -.b8 7 -.b8 207 -.b8 3 -.b8 2 -.b8 35 -.b8 0 -.b8 11 -.b8 121 -.b8 0 -.b32 455 -.b8 7 -.b8 207 -.b8 3 -.b8 2 -.b8 35 -.b8 4 -.b8 11 -.b8 122 -.b8 0 -.b32 455 -.b8 7 -.b8 207 -.b8 3 -.b8 2 -.b8 35 -.b8 8 -.b8 0 -.b8 9 -.b8 117,105,110,116,51 -.b8 0 -.b8 12 -.b8 7 -.b8 32 -.b8 3 -.b8 11 -.b8 120 -.b8 0 -.b32 455 -.b8 7 -.b8 34 -.b8 3 -.b8 2 -.b8 35 -.b8 0 -.b8 11 -.b8 121 -.b8 0 -.b32 455 -.b8 7 -.b8 34 -.b8 3 -.b8 2 -.b8 35 -.b8 4 -.b8 11 -.b8 122 -.b8 0 -.b32 455 -.b8 7 -.b8 34 -.b8 3 -.b8 2 -.b8 35 -.b8 8 -.b8 0 -.b8 13 -.b32 10429 -.b8 117,105,110,116,51 -.b8 0 -.b8 7 -.b8 178 -.b8 3 -.b8 14 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,95,98,97,115,101,73,76,106,52,69,69,69 -.b8 0 -.b8 16 -.b8 3 -.b8 162 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103 -.b8 114,111,117,112,69 -.b8 0 -.b32 13983 -.b8 2 -.b8 35 -.b8 0 -.b8 0 -.b8 9 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,69 -.b8 0 -.b8 16 -.b8 3 -.b8 61 -.b8 2 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,103 -.b8 114,111,117,112,95,98,97,115,101,73,76,106,52,69,69,69 -.b8 0 -.b32 10494 -.b8 2 -.b8 35 -.b8 0 -.b8 0 -.b8 5 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,54,95,95,115,116,97,116,105,99,95,115 -.b8 105,122,101,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,98,97,115,101,73,76,106,51,50,69,69,69 -.b8 0 -.b8 1 -.b8 3 -.b8 229 -.b8 3 -.b8 3 -.b32 10330 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,57,116,101,109 -.b8 112,108,97,116,101,115,49,54,114,101,109,111,118,101,95,114,101,102,101,114,101,110,99,101,73,82,105,69,52,116,121,112,101,69 -.b8 0 -.b8 4 -.b8 213 -.b8 3 -.b32 10803 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,57,116,101,109 -.b8 112,108,97,116,101,115,49,50,114,101,109,111,118,101,95,99,111,110,115,116,73,105,69,52,116,121,112,101,69 -.b8 0 -.b8 4 -.b8 219 -.b8 3 -.b32 10885 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,57,116,101,109 -.b8 112,108,97,116,101,115,49,53,114,101,109,111,118,101,95,118,111,108,97,116,105,108,101,73,105,69,52,116,121,112,101,69 -.b8 0 -.b8 4 -.b8 222 -.b8 3 -.b32 10962 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,57,116,101,109 -.b8 112,108,97,116,101,115,57,114,101,109,111,118,101,95,99,118,73,105,69,52,116,121,112,101,69 -.b8 0 -.b8 4 -.b8 225 -.b8 5 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,95,116,105,108,101,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111,99,107,69,69,69 -.b8 0 -.b8 1 -.b8 3 -.b8 66 -.b8 6 -.b8 5 -.b8 
95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,49,95,95,115,105,110,103,108,101,95,119 -.b8 97,114,112,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95 -.b8 98,108,111,99,107,69,69,69 -.b8 0 -.b8 1 -.b8 3 -.b8 114 -.b8 4 -.b8 5 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,50,50,116,104 -.b8 114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,105,109,112,108,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98 -.b8 108,111,99,107,69,76,98,48,69,69,69 -.b8 0 -.b8 1 -.b8 3 -.b8 31 -.b8 6 -.b8 5 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,50,48,116,105 -.b8 108,101,100,95,112,97,114,116,105,116,105,111,110,95,105,109,112,108,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,69,69,69 -.b8 0 -.b8 1 -.b8 3 -.b8 107 -.b8 6 -.b8 3 -.b32 10803 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,57,116,101,109 -.b8 112,108,97,116,101,115,49,56,114,101,109,111,118,101,95,114,101,102,101,114,101,110,99,101,95,116,73,82,105,69,69 -.b8 0 -.b8 4 -.b8 217 -.b8 13 -.b32 10379 -.b8 100,105,109,51 -.b8 0 -.b8 7 -.b8 220 -.b8 3 -.b8 16 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,67,49,69,118 -.b8 0 -.b8 116,104,114,101,97,100,95,98,108,111,99,107 -.b8 0 -.b8 3 -.b8 93 -.b8 2 -.b32 10149 -.b8 1 -.b8 17 -.b8 116,104,105,115 -.b8 0 -.b32 11649 -.b8 0 -.b8 7 -.b32 11654 -.b8 4 -.b32 10608 -.b32 12 -.b8 18 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,105,115,95,116,104,114,101,97 -.b8 100,95,98,108,111,99,107,69,118 -.b8 0 -.b8 116,104,105,115,95,116,104,114,101,97,100,95,98,108,111,99,107 -.b8 0 -.b8 3 -.b8 168 -.b8 2 -.b32 10608 -.b8 1 -.b8 16 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,53,116,105,108,101,100,95,112,97,114,116 -.b8 105,116,105,111,110,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111,99,107,69,69,69,78,83,48,95,49,55,116,104 -.b8 114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,73,88,84,95,69,84,48,95,69,69,82,75,83,52,95 -.b8 0 -.b8 116,105,108,101,100,95,112,97,114,116,105,116,105,111,110,60,51,50,85,44,116,104,114,101,97,100,95,98,108,111,99,107,62 -.b8 0 -.b8 3 -.b8 142 -.b8 6 -.b32 11115 -.b8 1 -.b8 19 -.b8 103 -.b8 0 -.b8 3 -.b8 142 -.b8 6 -.b32 11905 -.b8 0 -.b8 20 -.b32 11910 -.b8 7 -.b32 10608 -.b8 16 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,50,48,116,105 -.b8 108,101,100,95,112,97,114,116,105,116,105,111,110,95,105,109,112,108,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,69,69,67,49,69,82,75,83,51,95 -.b8 0 -.b8 116,105,108,101,100,95,112,97,114,116,105,116,105,111,110,95,105,109,112,108 -.b8 0 -.b8 3 -.b8 111 -.b8 6 -.b32 10149 -.b8 1 -.b8 17 -.b8 116,104,105,115 -.b8 0 -.b32 12059 -.b8 19 -.b8 103 -.b8 0 -.b8 3 -.b8 111 -.b8 6 -.b32 11905 -.b8 0 -.b8 7 -.b32 12064 -.b8 4 -.b32 11386 -.b32 12 -.b8 16 -.b8 
95,90,78,75,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,98,108 -.b8 111,99,107,95,116,105,108,101,73,76,106,51,50,69,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111,99,107,69,69,99,118,78,83,49,95 -.b8 73,76,106,51,50,69,118,69,69,69,118 -.b8 0 -.b8 111,112,101,114,97,116,111,114,32,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,58,58,95,95,118,49,58,58,116,104,114,101,97 -.b8 100,95,98,108,111,99,107,95,116,105,108,101,60,51,50,85,44,32,118,111,105,100,62 -.b8 0 -.b8 3 -.b8 75 -.b8 6 -.b32 13423 -.b8 1 -.b8 17 -.b8 116,104,105,115 -.b8 0 -.b32 12249 -.b8 0 -.b8 7 -.b32 12254 -.b8 4 -.b32 12263 -.b32 12 -.b8 7 -.b32 11115 -.b8 16 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,95,116,105,108,101,73,76,106,51,50,69,118,69,67,49,73,78,83,48,95,49,50,116,104,114,101,97,100,95,98,108,111,99,107,69,69,69,82 -.b8 75,78,83,49,95,73,76,106,51,50,69,84,95,69,69 -.b8 0 -.b8 116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,60,116,104,114,101,97,100,95,98,108,111,99,107,62 -.b8 0 -.b8 3 -.b8 93 -.b8 6 -.b32 10149 -.b8 1 -.b8 17 -.b8 116,104,105,115 -.b8 0 -.b32 12426 -.b8 19 -.b8 103 -.b8 0 -.b8 3 -.b8 93 -.b8 6 -.b32 12440 -.b8 0 -.b8 7 -.b32 12431 -.b8 4 -.b32 13423 -.b32 12 -.b8 20 -.b32 12263 -.b8 16 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,54,95,95,115,116,97,116,105,99,95,115 -.b8 105,122,101,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,98,97,115,101,73,76,106,51,50,69,69,49,48,98,117,105,108,100 -.b8 95,109,97,115,107,69,118 -.b8 0 -.b8 98,117,105,108,100,95,109,97,115,107 -.b8 0 -.b8 3 -.b8 243 -.b8 3 -.b32 455 -.b8 1 -.b8 21 -.b8 22 -.b8 109,97,115,107 -.b8 0 -.b8 3 -.b8 244 -.b8 3 -.b32 455 -.b8 21 -.b8 22 -.b8 108,97,110,101,73,100 -.b8 0 -.b8 3 -.b8 247 -.b8 3 -.b32 455 -.b8 0 -.b8 0 -.b8 0 -.b8 16 -.b8 95,90,78,75,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,54,95,95,115,116,97,116,105,99,95 -.b8 115,105,122,101,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,98,97,115,101,73,76,106,51,50,69,69,51,97,110,121,69,105 -.b8 0 -.b8 97,110,121 -.b8 0 -.b8 3 -.b8 56 -.b8 4 -.b32 10296 -.b8 1 -.b8 17 -.b8 116,104,105,115 -.b8 0 -.b32 12731 -.b8 19 -.b8 112,114,101,100,105,99,97,116,101 -.b8 0 -.b8 3 -.b8 56 -.b8 4 -.b32 10296 -.b8 21 -.b8 22 -.b8 108,97,110,101,95,98,97,108,108,111,116 -.b8 0 -.b8 3 -.b8 57 -.b8 4 -.b32 455 -.b8 0 -.b8 0 -.b8 7 -.b32 12736 -.b8 4 -.b32 12745 -.b32 12 -.b8 7 -.b32 10723 -.b8 23 -.b64 $L__func_begin0 -.b64 $L__func_end0 -.b8 1 -.b8 156 -.b8 95,90,49,56,65,120,66,95,100,111,116,51,95,112,104,97,115,101,51,95,109,112,73,105,105,105,69,118,120,120,80,120,80,49,54,71,66,95,77,97 -.b8 116,114,105,120,95,111,112,97,113,117,101,83,50,95,83,50,95,83,50,95,105 -.b8 0 -.b8 65,120,66,95,100,111,116,51,95,112,104,97,115,101,51,95,109,112,60,105,110,116,51,50,95,116,44,105,110,116,51,50,95,116,44,105,110,116,51,50 -.b8 95,116,62 -.b8 0 -.b8 10 -.b8 76 -.b32 10149 -.b8 1 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_0 -.b8 7 -.b8 115,116,97,114,116 -.b8 0 -.b8 10 -.b8 78 -.b32 9285 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_1 -.b8 7 -.b8 101,110,100 -.b8 0 -.b8 10 -.b8 79 -.b32 9285 -.b8 24 
-.b8 7 -.b8 144 -.b8 184 -.b8 230 -.b8 144 -.b8 147 -.b8 215 -.b8 4 -.b8 2 -.b8 66,117,99,107,101,116 -.b8 0 -.b8 10 -.b8 80 -.b32 10131 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_3 -.b8 7 -.b8 67 -.b8 0 -.b8 10 -.b8 81 -.b32 10303 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_4 -.b8 7 -.b8 77 -.b8 0 -.b8 10 -.b8 82 -.b32 10303 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_5 -.b8 7 -.b8 65 -.b8 0 -.b8 10 -.b8 83 -.b32 10303 -.b8 24 -.b8 9 -.b8 3 -.b64 _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i_param_6 -.b8 7 -.b8 66 -.b8 0 -.b8 10 -.b8 84 -.b32 10303 -.b8 24 -.b8 6 -.b8 144 -.b8 184 -.b8 236 -.b8 200 -.b8 171 -.b8 2 -.b8 2 -.b8 115,122 -.b8 0 -.b8 10 -.b8 85 -.b32 10296 -.b8 25 -.b64 $L__tmp0 -.b64 $L__tmp178 -.b8 26 -.b8 6 -.b8 12 -.b8 3 -.b64 __local_depot0 -.b8 35 -.b8 128,1 -.b8 116,105,108,101 -.b8 0 -.b8 10 -.b8 118 -.b32 13423 -.b8 27 -.b8 6 -.b8 144 -.b8 177 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 65,120 -.b8 0 -.b8 10 -.b8 90 -.b32 16686 -.b8 27 -.b8 6 -.b8 144 -.b8 178 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 66,120 -.b8 0 -.b8 10 -.b8 91 -.b32 16686 -.b8 27 -.b8 6 -.b8 144 -.b8 179 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 67,120 -.b8 0 -.b8 10 -.b8 92 -.b32 16686 -.b8 27 -.b8 6 -.b8 144 -.b8 180 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 67,105 -.b8 0 -.b8 10 -.b8 93 -.b32 10131 -.b8 27 -.b8 6 -.b8 144 -.b8 181 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 77,105 -.b8 0 -.b8 10 -.b8 94 -.b32 10131 -.b8 27 -.b8 6 -.b8 144 -.b8 182 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 65,105 -.b8 0 -.b8 10 -.b8 95 -.b32 10131 -.b8 27 -.b8 6 -.b8 144 -.b8 183 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 66,105 -.b8 0 -.b8 10 -.b8 96 -.b32 10131 -.b8 27 -.b8 6 -.b8 144 -.b8 184 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 65,112 -.b8 0 -.b8 10 -.b8 97 -.b32 10131 -.b8 27 -.b8 6 -.b8 144 -.b8 185 -.b8 200 -.b8 201 -.b8 171 -.b8 2 -.b8 2 -.b8 66,112 -.b8 0 -.b8 10 -.b8 98 -.b32 10131 -.b8 28 -.b32 .debug_loc -.b8 122,99 -.b8 0 -.b8 10 -.b8 102 -.b32 10296 -.b8 27 -.b8 6 -.b8 144 -.b8 181 -.b8 238 -.b8 200 -.b8 171 -.b8 2 -.b8 2 -.b8 116,105,100,95,103,108,111,98,97,108 -.b8 0 -.b8 10 -.b8 108 -.b32 10296 -.b8 27 -.b8 5 -.b8 144 -.b8 178 -.b8 228 -.b8 149 -.b8 1 -.b8 2 -.b8 116,105,100 -.b8 0 -.b8 10 -.b8 109 -.b32 10296 -.b8 27 -.b8 6 -.b8 144 -.b8 184 -.b8 238 -.b8 200 -.b8 171 -.b8 2 -.b8 2 -.b8 98 -.b8 0 -.b8 10 -.b8 111 -.b32 10296 -.b8 28 -.b32 .debug_loc+284 -.b8 110,110,122,65 -.b8 0 -.b8 10 -.b8 114 -.b32 9285 -.b8 28 -.b32 .debug_loc+402 -.b8 110,110,122,66 -.b8 0 -.b8 10 -.b8 115 -.b32 9285 -.b8 28 -.b32 .debug_loc+468 -.b8 110,95,105,110,116,101,114,115,101,99,116 -.b8 0 -.b8 10 -.b8 116 -.b32 9285 -.b8 27 -.b8 5 -.b8 144 -.b8 179 -.b8 228 -.b8 149 -.b8 1 -.b8 2 -.b8 112,97,114,116,115 -.b8 0 -.b8 10 -.b8 120 -.b32 10296 -.b8 28 -.b32 .debug_loc+535 -.b8 112,97,105,114,95,105,100 -.b8 0 -.b8 10 -.b8 105 -.b32 9285 -.b8 9 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,98,108,111 -.b8 99,107,95,116,105,108,101,73,76,106,51,50,69,118,69,69 -.b8 0 -.b8 16 -.b8 3 -.b8 66 -.b8 6 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,50,50 -.b8 
116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,105,109,112,108,73,76,106,51,50,69,118,76,98,48,69,69,69 -.b8 0 -.b32 13569 -.b8 2 -.b8 35 -.b8 0 -.b8 9 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,55,100,101,116,97,105,108,115,50,50,116,104 -.b8 114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,95,105,109,112,108,73,76,106,51,50,69,118,76,98,48,69,69,69 -.b8 0 -.b8 16 -.b8 3 -.b8 31 -.b8 6 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,49,95,95,115,105,110,103,108,101 -.b8 95,119,97,114,112,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,73,76,106,51,50,69,118,69,69 -.b8 0 -.b32 13729 -.b8 2 -.b8 35 -.b8 0 -.b8 9 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,51,49,95,95,115,105,110,103,108,101,95,119 -.b8 97,114,112,95,116,104,114,101,97,100,95,98,108,111,99,107,95,116,105,108,101,73,76,106,51,50,69,118,69,69 -.b8 0 -.b8 16 -.b8 3 -.b8 114 -.b8 4 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,103 -.b8 114,111,117,112,95,98,97,115,101,73,76,106,49,69,69,69 -.b8 0 -.b32 13870 -.b8 2 -.b8 35 -.b8 0 -.b8 14 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,55,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,95,98,97,115,101,73,76,106,49,69,69,69 -.b8 0 -.b8 16 -.b8 3 -.b8 162 -.b8 15 -.b8 95,95,98,95,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103 -.b8 114,111,117,112,69 -.b8 0 -.b32 13983 -.b8 2 -.b8 35 -.b8 0 -.b8 14 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,69 -.b8 0 -.b8 16 -.b8 3 -.b8 87 -.b8 10 -.b8 95,100,97,116,97 -.b8 0 -.b32 14047 -.b8 3 -.b8 131 -.b8 2 -.b8 35 -.b8 0 -.b8 29 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,85,116,95,69 -.b8 0 -.b8 16 -.b8 3 -.b8 124 -.b8 10 -.b8 103,114,111,117,112 -.b8 0 -.b32 14114 -.b8 3 -.b8 125 -.b8 2 -.b8 35 -.b8 0 -.b8 14 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,49,48,103,114,111,117,112,95,100,97,116,97,69 -.b8 0 -.b8 4 -.b8 3 -.b8 90 -.b8 30 -.b8 95,117,110,117,115,101,100 -.b8 0 -.b32 455 -.b8 3 -.b8 91 -.b8 4 -.b8 1 -.b8 31 -.b8 2 -.b8 35 -.b8 0 -.b8 30 -.b8 116,121,112,101 -.b8 0 -.b32 455 -.b8 3 -.b8 92 -.b8 4 -.b8 7 -.b8 24 -.b8 2 -.b8 35 -.b8 0 -.b8 31 -.b32 455 -.b8 3 -.b8 92 -.b8 4 -.b8 0 -.b8 32 -.b8 2 -.b8 35 -.b8 4 -.b8 0 -.b8 10 -.b8 99,111,97,108,101,115,99,101,100 -.b8 0 -.b32 14247 -.b8 3 -.b8 126 -.b8 2 -.b8 35 -.b8 0 -.b8 14 -.b8 95,90,78,49,56,99,111,111,112,101,114,97,116,105,118,101,95,103,114,111,117,112,115,52,95,95,118,49,49,50,116,104,114,101,97,100,95,103,114,111 -.b8 117,112,55,116,103,95,100,97,116,97,69 -.b8 0 -.b8 16 -.b8 3 -.b8 108 -.b8 30 -.b8 105,115,95,116,105,108,101,100 -.b8 0 -.b32 455 -.b8 3 -.b8 109 -.b8 4 -.b8 1 -.b8 31 -.b8 2 -.b8 35 -.b8 0 -.b8 30 -.b8 116,121,112,101 -.b8 0 -.b32 455 -.b8 3 -.b8 110 -.b8 4 -.b8 7 -.b8 24 -.b8 2 -.b8 35 -.b8 0 -.b8 30 -.b8 115,105,122,101 -.b8 0 
-[... PTX DWARF debug-info byte records (.b8/.b32/.b64 entries in .debug_info, .debug_loc and .debug_macinfo) for the instantiated kernel and its cooperative_groups helpers elided ...]
- ----------------------------------------
-instantiated kernel
- ----------------------------------------
---- Linker for void AxB_dot3_phase3_mp(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, int) ---
- ----------------------------------------
-ptxas info : 59 bytes gmem
-ptxas info : Compiling entry function '_Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i' for 'sm_70'
-ptxas info : Function properties for _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i
-ptxas . 160 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-ptxas info : Used 138 registers, 412 bytes cmem[0]
-[... ptxas stack-frame and spill statistics for the GB_reduce_sum, cooperative_groups, dim3, atomicAdd, __ballot_sync and __shfl_down_sync helper functions elided ...]
-info : 59 bytes gmem
-info : Function properties for '_Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i':
-info : used 138 registers, 320 stack, 0 bytes smem, 412 bytes cmem[0], 0 bytes lmem
-
- ----------------------------------------
-completed func()
-Inside serialize!!!!
- compiled serialized prog AxB_dot3_phase3_mp_int32_t_int32_t_int32_t
-writing prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/AxB_dot3_phase3_mp_int32_t_int32_t_int32_t
- ----------------------------------------
---- Linker for void AxB_dot3_phase3_mp(long long, long long, long long*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, GB_Matrix_opaque*, int) ---
- ----------------------------------------
-info : 59 bytes gmem
-info : Function properties for '_Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i':
-info : used 138 registers, 320 stack, 0 bytes smem, 412 bytes cmem[0], 0 bytes lmem
-
- ----------------------------------------
-Launching _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-warp 0 zombie count = 27, nzombies = 0
- Czombie = 27
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 2222.69ms
-
-  32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row
-  sparsity control: sparse only
-  C GPU, 32 entries, memory: 1.1 KB
-  pending tuples: 0 max pending: 0 zombies: 27
-
-[... 29 of the 32 entries listed: 27 zombies plus the live entries (1,1) 1, (6,24) 0, (9,31) 0, (16,20) 0, (25,4) 1 ...]
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
-[... rmm_wrap_alloc trace (3 x 512 bytes) elided ...]
-
-  32x32 GraphBLAS bool matrix, sparse by row
-  sparsity control: sparse only
-  M actual, 32 entries, memory: 1.0 KB
-
-[... 29 of the 32 mask entries listed, all equal to 1 ...]
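The zombie bookkeeping above ("warp 0 zombie count = 27", "zombies: 27", and the "(0,6) zombie" entries in the printout) reflects GraphBLAS's lazy deletion: entries the mask rules out of C are not removed on the spot; their indices are flipped in place and the matrix is compacted later. A minimal C sketch of that convention, assuming the -(i)-2 flip used by the library's GB_FLIP macro; the surrounding bookkeeping here is illustrative, not the library's actual code:

    #include <stdio.h>
    #include <inttypes.h>

    #define FLIP(i)      (-(i) - 2)     /* live index i >= 0 -> negative code */
    #define IS_ZOMBIE(i) ((i) < 0)
    #define UNFLIP(i)    (IS_ZOMBIE (i) ? FLIP (i) : (i))  /* FLIP inverts itself */

    int main (void)
    {
        int64_t Ci [5] = { 6, 24, 31, 20, 4 } ;     /* indices in C's pattern */
        int64_t nzombies = 0 ;

        /* a kernel that finds entries 0 and 2 to be dead flips them in place */
        Ci [0] = FLIP (Ci [0]) ; nzombies++ ;
        Ci [2] = FLIP (Ci [2]) ; nzombies++ ;

        for (int k = 0 ; k < 5 ; k++)
        {
            printf ("(%d) index %" PRId64 " %s\n", k, UNFLIP (Ci [k]),
                IS_ZOMBIE (Ci [k]) ? "zombie" : "live") ;
        }
        printf ("zombie count = %" PRId64 "\n", nzombies) ;
        return (0) ;
    }

Because the flip is its own inverse, marking costs no extra storage, and a single pass counting negative indices recovers the zombie total before the prune.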
-
-  32x32 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  C GPU, 5 entries, memory: 864 bytes
-
-    (1,1)    1
-    (6,24)    0
-    (9,31)    0
-    (16,20)    0
-    (25,4)    1
-
-  32x32 GraphBLAS int32_t matrix, sparse by row
-  C_actual, 5 entries, memory: 864 bytes
-[... the same 5 entries as C GPU ...]
- rmm_wrap_alloc 256 bytes
-
-  32x32 GraphBLAS double matrix, sparse by row
-  Diff actual, 5 entries, memory: 896 bytes
-[... the same 5 positions, all values 0 ...]
-
-  32x32 GraphBLAS bool matrix, sparse by row
-  T actual, 5 entries, memory: 840 bytes
-[... the same 5 positions, all values 1 ...]
- work:5 gpus:0 [       OK ] AxB_dot3_tests_PLUS_TIMES_3.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (2249 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_3.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t
-Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
- rmm_wrap_alloc 16384 bytes
-[... a second sparse fill (seed 4567, 1024 values) and two dense 1024x1024 fills (seeds 543210 and 32, 1048576 values each, invsparse = 1), with their progress lines and rmm_wrap_alloc traces, elided ...]
-1024 slots to fill
-all pairs to bucket 1, no filling
-done assigning buckets
-Building semiring factgory
- calling stringify semiring: 0x7f1ff5305700
-inside enumify: 0x7f1ff5305700
-
-  GraphBLAS Semiring: semiring (user-defined)
-  GraphBLAS Monoid: semiring->add (built-in)
-  GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y)
-  GraphBLAS type: ztype int32_t size: 4
-  GraphBLAS type: xtype int32_t size: 4
-  GraphBLAS type: ytype int32_t size: 4
-  identity: [ 0 ]
-
-  GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y)
-  GraphBLAS type: ztype int32_t size: 4
-  GraphBLAS type: xtype int32_t size: 4
-  GraphBLAS type: ytype int32_t size: 4
-Getting semiring add
-Getting semiring mult
-Getting semiring add op
-Getting types
-Getting opcodes
-Getting typecodes
-Performing asserts
-Invoking boolean rename
-Invoking boolean rename
-Invoking enumify binop
-e 14
-Invoking enumify monoid
-Calling enumify binop
-Inside plus binop code
-e 11
-Calling enumify identity
-Calling enumify terminal
-Done enumify monoid
-Done invoking enumify monoid
-atype
-btype
-ctype
-Invoking enumify_mask, mtype 0x7f2028b57180
-GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0
-got mask_ecode: 4
-constructing semiring scode
-before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0
-serialized_scode: 397409434374399488
-done enumify semiring
-scode=397409434374399488
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 1, no filling
-done assigning buckets
-bucket 1 has 1024 dots to do
-LAUNCHING BUCKET CODE: 1
-INside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-successful_read: 1
-Just closed
- jit_cache get program GB_jit_AxB_dot3_phase3_dndn
-found memory-cached prog GB_jit_AxB_dot3_phase3_dndn
- got kernel instance AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t
-Launching _Z20AxB_dot3_phase3_dndnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-[... per-dot-product trace ("tid=0, i,j = ..., nnzA= 1024, nnzB=1024") for all 1024 mask entries elided; it continues until the kernel returns below ...]
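The fill_random lines in the trace above size each test problem as need = nrows*ncols/invsparse random entries, so invsparse = 1024 gives the 1024-entry sparse fill and invsparse = 1 gives the full 1048576-entry dense fill. A rough, self-contained C sketch of that sizing logic; fill_random_sketch and its rand()-based generator are stand-ins for the test harness's own fill_random:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    /* pick "need" random (i,j,x) triplets for an nrows-by-ncols test matrix */
    static void fill_random_sketch (int64_t nrows, int64_t ncols,
        int64_t invsparse, unsigned seed)
    {
        int64_t need = (nrows * ncols) / invsparse ;
        printf ("fill_random nrows=%lld ncols=%lld need %lld values, "
            "invsparse = %lld\n", (long long) nrows, (long long) ncols,
            (long long) need, (long long) invsparse) ;
        srand (seed) ;
        for (int64_t k = 0 ; k < need ; k++)
        {
            int64_t i = rand ( ) % nrows ;
            int64_t j = rand ( ) % ncols ;
            int32_t x = rand ( ) % 256 ;
            /* a real harness collects the triplets, retrying duplicate (i,j)
               slots (hence the "nonzeroes left to fill.." countdown), and
               then builds a GrB_Matrix from them */
            (void) i ; (void) j ; (void) x ;
        }
    }

    int main (void)
    {
        fill_random_sketch (1024, 1024, 1024, 4567) ;   /* sparse: 1024 entries    */
        fill_random_sketch (1024, 1024, 1, 543210) ;    /* dense: 1048576 entries  */
        return (0) ;
    }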
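The "constructing semiring scode" step traced above is what keys the JIT cache: every enum printed on the "before:" line (add_ecode, mult_ecode, the type codes, mask_ecode, and the sparsity formats) is packed into a single 64-bit scode, and that integer names both the generated header GB_semiring_397409434374399488.h and the cached kernel. A compact sketch of that kind of bit-packing; the field widths and order here are hypothetical rather than the library's actual layout, so the value printed will not reproduce the scode shown:

    #include <stdio.h>
    #include <stdint.h>

    /* shift the running code left and append one small enum field */
    #define PUT(field,nbits) \
        do { scode = (scode << (nbits)) | ((uint64_t) (field)) ; } while (0)

    int main (void)
    {
        uint64_t scode = 0 ;
        /* field values taken from the "before:" line in the log */
        PUT (11, 5) ;   /* add_ecode  : z = plus (x,y)     */
        PUT (0,  5) ;   /* id_ecode   : identity 0         */
        PUT (31, 5) ;   /* term_ecode : no terminal value  */
        PUT (14, 6) ;   /* mult_ecode : z = times (x,y)    */
        PUT (0,  1) ;   /* flipxy                          */
        PUT (6,  4) ;   /* zcode : int32_t                 */
        PUT (6,  4) ;   /* xcode                           */
        PUT (6,  4) ;   /* ycode                           */
        PUT (4,  4) ;   /* mask_ecode                      */
        PUT (6,  4) ;   /* ccode                           */
        PUT (6,  4) ;   /* acode                           */
        PUT (6,  4) ;   /* bcode                           */
        PUT (0,  2) ;   /* csparsity : sparse              */
        PUT (0,  2) ;   /* msparsity                       */
        PUT (0,  2) ;   /* asparsity                       */
        PUT (0,  2) ;   /* bsparsity                       */
        printf ("scode = %llu\n", (unsigned long long) scode) ;
        return (0) ;
    }

A single-integer key is cheap to compare and safe to embed in a filename, which is how the cache-file lookup above finds a previously generated kernel without recompiling it.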
180,1017 nnzA= 1024, nnzB=1024 -tid=0, i,j = 254,1018 nnzA= 1024, nnzB=1024 -tid=0, i,j = 667,1014 nnzA= 1024, nnzB=1024 -tid=0, i,j = 992,994 nnzA= 1024, nnzB=1024 -tid=0, i,j = 531,992 nnzA= 1024, nnzB=1024 -tid=0, i,j = 286,1002 nnzA= 1024, nnzB=1024 -tid=0, i,j = 15,992 nnzA= 1024, nnzB=1024 -tid=0, i,j = 674,993 nnzA= 1024, nnzB=1024 -tid=0, i,j = 127,1015 nnzA= 1024, nnzB=1024 -tid=0, i,j = 810,1021 nnzA= 1024, nnzB=1024 -tid=0, i,j = 533,1012 nnzA= 1024, nnzB=1024 -tid=0, i,j = 457,1016 nnzA= 1024, nnzB=1024 -tid=0, i,j = 159,1008 nnzA= 1024, nnzB=1024 -tid=0, i,j = 613,1015 nnzA= 1024, nnzB=1024 -tid=0, i,j = 574,1013 nnzA= 1024, nnzB=1024 -tid=0, i,j = 574,1009 nnzA= 1024, nnzB=1024 -tid=0, i,j = 840,1023 nnzA= 1024, nnzB=1024 -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 9.10131ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - - (0,478) 268 - (0,574) 246 - (2,376) 235 - (5,560) 278 - (6,996) 255 - (7,183) 256 - (7,666) 248 - (8,896) 255 - (9,187) 274 - (10,446) 256 - (11,46) 270 - (11,955) 284 - (12,397) 250 - (12,953) 259 - (13,192) 278 - (14,421) 267 - (15,568) 251 - (16,788) 225 - (16,904) 246 - (17,928) 240 - (18,103) 262 - (19,821) 235 - (19,886) 236 - (20,474) 267 - (21,479) 248 - (21,975) 251 - (22,569) 255 - (23,310) 272 - (24,905) 262 - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... 
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- C GPU, 1024 entries, memory: 28.2 KB
-
- (0,478) 268
- (0,574) 246
- (2,376) 235
- (5,560) 278
- ...
- (1023,840) 264
-
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- C_actual, 1024 entries, memory: 40.2 KB
-
- (0,478) 268
- (0,574) 246
- (2,376) 235
- (5,560) 278
- ...
- (1023,840) 264
-
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 16384 bytes
-
- 1024x1024 GraphBLAS double matrix, sparse by row
- Diff actual, 1024 entries, memory: 32.2 KB
-
- (0,478) 0
- (0,574) 0
- (2,376) 0
- ...
- (1023,840) 0
-
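The all-zero "Diff actual" matrix above is the correctness check: the GPU result ("C GPU") is compared entry by entry against the CPU reference ("C_actual"), and every difference is zero. A minimal sketch of such a check, assuming the standard GraphBLAS C API; the function name check_results and the eWiseAdd/apply/reduce recipe are illustrative assumptions, not the test harness's actual code:

    #include <GraphBLAS.h>
    #include <stdio.h>

    // Sketch: build Diff = C_actual - C_gpu and reduce |Diff| to a scalar.
    // A max of zero means the GPU and CPU results agree on every entry.
    GrB_Info check_results (GrB_Matrix C_gpu, GrB_Matrix C_actual, GrB_Index n)
    {
        GrB_Matrix Diff ;
        GrB_Matrix_new (&Diff, GrB_FP64, n, n) ;
        // entry-wise subtraction over the union of the two patterns
        GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_MINUS_FP64,
            C_actual, C_gpu, NULL) ;
        // take absolute values, then reduce with the MAX monoid
        GrB_Matrix_apply (Diff, NULL, NULL, GrB_ABS_FP64, Diff, NULL) ;
        double err = 0 ;
        GrB_Matrix_reduce_FP64 (&err, NULL, GrB_MAX_MONOID_FP64, Diff, NULL) ;
        printf ("max abs difference: %g\n", err) ;
        GrB_Matrix_free (&Diff) ;
        return (GrB_SUCCESS) ;
    }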
-
- 1024x1024 GraphBLAS bool matrix, sparse by row
- T actual, 1024 entries, memory: 25.2 KB
-
- (0,478) 1
- (0,574) 1
- (2,376) 1
- ...
- work:1024 gpus:0 Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-inside fill, using seed 543210
-fill_random nrows=1024 ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 524288 bytes
- ...
- rmm_wrap_alloc 8388608 bytes
-inside fill, using seed 32
-fill_random nrows=1024 ncols=1024 need 5120 values, invsparse = 205
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-5120 nonzeroes left to fill..
-2026 nonzeroes left to fill..
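The "filling sparse" trace above comes from the generator that builds the random test matrices from a fixed seed; random (i,j) positions can collide, so it keeps inserting until the requested number of entries is present, which is why a second round reports "35 nonzeroes left to fill..". A rough sketch of that loop, assuming the standard GraphBLAS C API; fill_random_matrix, its value range, and its insertion policy are assumptions based only on these log messages:

    #include <GraphBLAS.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Sketch: insert random entries until the matrix holds `need` values.
    // Duplicate (i,j) picks overwrite each other, so several rounds may
    // be required, as the "nonzeroes left to fill" messages suggest.
    void fill_random_matrix (GrB_Matrix A, GrB_Index nrows, GrB_Index ncols,
        GrB_Index need, unsigned seed)
    {
        srand (seed) ;
        GrB_Index nvals = 0 ;
        while (nvals < need)
        {
            printf ("%lu nonzeroes left to fill..\n",
                (unsigned long) (need - nvals)) ;
            for (GrB_Index k = 0 ; k < need - nvals ; k++)
            {
                GrB_Index i = rand ( ) % nrows ;
                GrB_Index j = rand ( ) % ncols ;
                GrB_Matrix_setElement_INT32 (A, rand ( ) % 256, i, j) ;
            }
            GrB_Matrix_wait (A, GrB_MATERIALIZE) ;  // flush pending tuples
            GrB_Matrix_nvals (&nvals, A) ;
        }
    }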
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 65536 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
-Building semiring factory
- calling stringify semiring: 0x7f1ff5304500
-inside enumify: 0x7f1ff5304500
-
- GraphBLAS Semiring: semiring (user-defined)
- GraphBLAS Monoid: semiring->add (built-in)
- GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
- identity: [ 0 ]
-
- GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
-Getting semiring add
-Getting semiring mult
-Getting semiring add op
-Getting types
-Getting opcodes
-Getting typecodes
-Performing asserts
-Invoking boolean rename
-Invoking boolean rename
-Invoking enumify binop
-e 14
-Invoking enumify monoid
-Calling enumify binop
-Inside plus binop code
-e 11
-Calling enumify identity
-Calling enumify terminal
-Done enumify monoid
-Done invoking enumify monoid
-atype
-btype
-ctype
-Invoking enumify_mask, mtype 0x7f2028b57180
-GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0
-got mask_ecode: 4
-constructing semiring scode
-before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0
-serialized_scode: 397409434374399488
-done enumify semiring
-scode=397409434374399488
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
-bucket 5 has 1024 dots to do
-LAUNCHING BUCKET CODE: 5
-Confirming spdn
-Inside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
-successful_read: 1
-Just closed
- jit_cache get program GB_jit_AxB_dot3_phase3_spdn
-found memory-cached prog GB_jit_AxB_dot3_phase3_spdn
- got kernel instance AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t
-Launching _Z20AxB_dot3_phase3_spdnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 2.7095ms
-
- 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row
- sparsity control: sparse only
- C GPU, 1024 entries, memory: 28.2 KB
- pending tuples: 0 max pending: 0 zombies: 131
-
- (0,478) 1
- (0,574) 2
- (2,376) zombie
- (5,560) 3
- (6,996) 2
- (7,183) 0
- ...
- (18,103) zombie
- ...
- rmm_wrap_alloc 256 bytes
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
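The stringify trace above describes the semiring under test: a user-defined GrB_Semiring whose additive monoid is the built-in z=plus(x,y) over int32_t with identity 0, and whose multiplier is the built-in z=times(x,y). A minimal sketch of constructing that semiring with the standard GraphBLAS C API; the "(user-defined)" tag in the trace suggests the harness builds the object itself rather than using the predefined GrB_PLUS_TIMES_SEMIRING_INT32, but that is an inference from the log, not confirmed code:

    #include <GraphBLAS.h>

    // Sketch: the plus-times int32 semiring described in the trace above.
    GrB_Info make_plus_times_int32 (GrB_Semiring *semiring)
    {
        GrB_Monoid plus_monoid ;
        // additive monoid: z = plus (x, y) over int32_t, identity 0
        GrB_Monoid_new_INT32 (&plus_monoid, GrB_PLUS_INT32, 0) ;
        // multiplicative op: z = times (x, y) over int32_t
        return (GrB_Semiring_new (semiring, plus_monoid, GrB_TIMES_INT32)) ;
    }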
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 893 entries, memory: 28.2 KB - - (0,478) 1 - (0,574) 2 - (5,560) 3 - (6,996) 2 - (7,183) 0 - (7,666) 0 - (8,896) 2 - (9,187) 0 - (10,446) 2 - (11,46) 2 - ... [remaining 883 entries omitted] - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - C_actual, 893 entries, memory: 28.2 KB - - (0,478) 1 - (0,574) 2 - (5,560) 3 - ... [all 893 entries identical to C GPU above; omitted] - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, sparse by row - Diff actual, 893 entries, memory: 32.2 KB - - (0,478) 0 - (0,574) 0 - (5,560) 0 - ... [every one of the 893 differences is 0; omitted] - - 1024x1024 GraphBLAS bool matrix, sparse by row - T actual, 893 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - (25,241) 1 - (26,428) 1 - ... - work:893 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill..
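The comparison blocks above all follow one pattern: the GPU result (C GPU, with zombies squeezed out) is printed next to a CPU reference (C_actual), a double-typed Diff matrix of elementwise differences is identically zero, and a boolean T matrix is all true. A hedged sketch of such a check in the C API; the matrix names come from the log, and the harness's actual checking code is not shown here:

    #include <stdbool.h>
    #include "GraphBLAS.h"

    // Diff = C_actual - C_gpu, T = (Diff == 0), then AND-reduce T.
    bool result_matches (GrB_Matrix C_actual, GrB_Matrix C_gpu, GrB_Index n)
    {
        GrB_Matrix Diff, T ;
        GrB_Matrix_new (&Diff, GrB_FP64, n, n) ;
        GrB_Matrix_eWiseAdd_BinaryOp (Diff, NULL, NULL, GrB_MINUS_FP64,
            C_actual, C_gpu, NULL) ;
        GrB_Matrix_new (&T, GrB_BOOL, n, n) ;
        GrB_Matrix_apply_BinaryOp2nd_FP64 (T, NULL, NULL, GrB_EQ_FP64,
            Diff, (double) 0, NULL) ;
        bool ok = false ;
        GrB_Matrix_reduce_BOOL (&ok, NULL, GrB_LAND_MONOID_BOOL, T, NULL) ;
        GrB_Matrix_free (&Diff) ;
        GrB_Matrix_free (&T) ;
        return (ok) ;
    }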
- rmm_wrap_alloc 8192 bytes -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. - rmm_wrap_alloc 4096 bytes -inside fill, using seed 543210 -fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -504 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes - rmm_wrap_alloc 8192 bytes -inside fill, using seed 32 -fill_random nrows=1024 ncols=1024 need 10240 values, invsparse = 103 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -10240 nonzeroes left to fill.. -4633 nonzeroes left to fill.. - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 131072 bytes - rmm_wrap_alloc 256 bytes -1024 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -Building semiring factory - calling stringify semiring: 0x7f1ff534e400 -inside enumify: 0x7f1ff534e400 - ... [semiring description and enumify trace identical to the bucket-5 run above; serialized_scode: 397409434374399488] ... -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -bucket 6 has 1024 dots to do -LAUNCHING BUCKET CODE: 6 -Inside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vssp -found memory-cached prog GB_jit_AxB_dot3_phase3_vssp - got kernel instance AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t -Launching
_Z20AxB_dot3_phase3_vsspIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.908288ms - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1001 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... - rmm_wrap_alloc 256 bytes -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 23 entries, memory: 16.6 KB - - (42,324) 0 - (73,665) 1 - (106,652) 0 - (138,288) 1 - (242,124) 1 - (295,1) 0 - (300,554) 1 - (312,61) 0 - (344,384) 0 - (496,267) 0 - (587,254) 1 - (686,202) 0 - (708,925) 1 - (715,751) 0 - (729,884) 0 - (741,365) 1 - (751,348) 1 - (792,636) 0 - (857,151) 0 - (876,357) 0 - (940,414) 0 - (945,671) 0 - (968,400) 1 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 23 entries, memory: 1.1 KB - - (42,324) 0 - (73,665) 1 - (106,652) 0 - (138,288) 1 - (242,124) 1 - (295,1) 0 - (300,554) 1 - (312,61) 0 - (344,384) 0 - (496,267) 0 - (587,254) 1 - (686,202) 0 - (708,925) 1 - (715,751) 0 - (729,884) 0 - (741,365) 1 - (751,348) 1 - (792,636) 0 - (857,151) 0 - (876,357) 0 - (940,414) 0 - (945,671) 0 - (968,400) 1 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 23 entries, memory: 1.2 KB - - (42,324) 0 - (73,665) 0 - (106,652) 0 - (138,288) 0 - (242,124) 0 - (295,1) 0 - (300,554) 0 - (312,61) 0 - (344,384) 0 - (496,267) 0 - (587,254) 0 - (686,202) 0 - (708,925) 0 - (715,751) 0 - (729,884) 0 - (741,365) 0 - (751,348) 0 - (792,636) 0 - (857,151) 0 - (876,357) 0 - (940,414) 0 - (945,671) 0 - (968,400) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 23 entries, memory: 1.0 KB - - (42,324) 1 - (73,665) 1 - (106,652) 1 - (138,288) 1 - (242,124) 1 - (295,1) 1 - (300,554) 1 - (312,61) 1 - (344,384) 1 - (496,267) 1 - (587,254) 1 
- (686,202) 1 - (708,925) 1 - (715,751) 1 - (729,884) 1 - (741,365) 1 - (751,348) 1 - (792,636) 1 - (857,151) 1 - (876,357) 1 - (940,414) 1 - (945,671) 1 - (968,400) 1 - work:23 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -504 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 16384 bytes -inside fill, using seed 32 -fill_random nrows=1024 ncols=1024 need 4096 values, invsparse = 256 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -4096 nonzeroes left to fill.. -1491 nonzeroes left to fill.. - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 32768 bytes -1024 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -Building semiring factory - calling stringify semiring: 0x7f1ff5382700 -inside enumify: 0x7f1ff5382700 - ... [semiring description and enumify trace identical to the bucket-5 run above; serialized_scode: 397409434374399488] ... -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -bucket 7 has 1024 dots to do -LAUNCHING BUCKET CODE: 7 -Inside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found
memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.616448ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1010 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... 
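The "zombies: 1010" line above is why a matrix still holding 1024 slots prints only 14 live entries once assembled: a zombie is an entry marked as deleted in place, left in the structure until a later wait/assembly pass compacts it away, which is far cheaper than deleting eagerly inside a kernel. The mark is an index encoding; the sketch below shows a self-inverse encoding of that kind (the library's actual internal macro may differ):

    #include <stdint.h>
    #include <stdbool.h>

    // Applying zombie_flip twice returns the original index, so the same
    // operation both marks and unmarks; negative indices denote zombies.
    static inline int64_t zombie_flip (int64_t i) { return (-i) - 2 ; }
    static inline bool    is_zombie   (int64_t i) { return (i < 0) ; }

    // live entries = slots held - zombies, e.g. 1024 - 1010 = 14 above.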
- rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 14 entries, memory: 16.4 KB - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 14 entries, memory: 704 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 14 entries, memory: 768 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 0 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 0 - (891,679) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 14 entries, memory: 656 bytes - - (99,326) 1 - (115,240) 1 - (176,614) 1 - (180,830) 1 - (343,678) 1 - (398,246) 1 - (411,643) 1 - (557,910) 1 - (590,95) 1 - (601,478) 1 - (623,44) 1 - (729,884) 1 - (825,406) 1 - (891,679) 1 - work:14 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -504 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=1024 ncols=1024 need 4096 values, invsparse = 256 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -4096 nonzeroes left to fill.. -1491 nonzeroes left to fill..
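The fill_random lines repeated above ("need 4096 values, invsparse = 256", then a countdown of nonzeroes left) suggest a simple generator: invsparse is just (nrows*ncols)/nvals, and random positions are drawn until the requested number of unique entries exists, the countdown shrinking slowly because collisions are re-drawn. A sketch under those assumptions; insert_if_new is a hypothetical helper, and this is not the harness's actual code:

    #include <stdint.h>
    #include <stdlib.h>
    #include <stdbool.h>

    // Illustrative only: place nvals entries at random (i,j) positions.
    void fill_random_sketch (int64_t nrows, int64_t ncols, int64_t nvals,
                             unsigned seed,
                             bool (*insert_if_new) (int64_t, int64_t, int))
    {
        srand (seed) ;
        int64_t invsparse = (nrows * ncols) / nvals ;  // 1024*1024/4096 = 256
        int64_t left = nvals ;
        while (left > 0)
        {
            int64_t i = rand () % nrows ;
            int64_t j = rand () % ncols ;
            if (insert_if_new (i, j, rand () % 4))  // collisions are re-drawn
            {
                left-- ;   // the log prints this countdown periodically
            }
        }
    }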
- rmm_wrap_alloc 16384 bytes -1024 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -Building semiring factory - calling stringify semiring: 0x7f1ff5382800 -inside enumify: 0x7f1ff5382800 - ... [semiring description and enumify trace identical to the bucket-5 run above; serialized_scode: 397409434374399488] ... -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -bucket 8 has 1024 dots to do -LAUNCHING BUCKET CODE: 8 -Inside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.801792ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1010 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... - rmm_wrap_alloc 256 bytes
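Every cache lookup in this log resolves the same 64-bit scode to the same header under ~/.SuiteSparse/GraphBLAS/6.3.0/, regenerates it if needed, and then finds the compiled program already memo-cached ("found memory-cached prog ..."), so only the first launch of a kernel pays the JIT cost. A sketch of just the path scheme, with the layout taken verbatim from the paths in the log (the helper name is ours):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    // e.g. /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h
    static void semiring_cache_path (char *buf, size_t len, uint64_t scode)
    {
        const char *home = getenv ("HOME") ;
        snprintf (buf, len, "%s/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_%llu.h",
            home ? home : ".", (unsigned long long) scode) ;
    }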
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 14 entries, memory: 16.4 KB - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 14 entries, memory: 704 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 14 entries, memory: 768 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 0 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 0 - (891,679) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 14 entries, memory: 656 bytes - - (99,326) 1 - (115,240) 1 - (176,614) 1 - (180,830) 1 - (343,678) 1 - (398,246) 1 - (411,643) 1 - (557,910) 1 - (590,95) 1 - (601,478) 1 - (623,44) 1 - (729,884) 1 - (825,406) 1 - (891,679) 1 - work:14 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -504 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=1024 ncols=1024 need 4096 values, invsparse = 256 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -4096 nonzeroes left to fill.. -1491 nonzeroes left to fill..
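The matrix listings throughout this log (type and dimensions, entry and memory counts, then "(i,j) value" pairs with long lists cut off at "...") have the shape of GxB_Matrix_fprint output at a short print level; a sketch, assuming that is indeed how the harness prints its operands:

    #include <stdio.h>
    #include "GraphBLAS.h"

    // Header plus truncated entry list, as seen for "M actual", "C GPU", etc.
    static void show (GrB_Matrix A, const char *name)
    {
        GxB_Matrix_fprint (A, name, GxB_SHORT, stdout) ;
    }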
- rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes -1024 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5383200 -inside enumify: 0x7f1ff5383200 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -bucket 9 has 1024 dots to do -LAUNCHING BUCKET CODE: 9 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.820224ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1010 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... - rmm_wrap_alloc 256 bytes -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 14 entries, memory: 16.4 KB - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 14 entries, memory: 704 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 14 entries, memory: 768 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 0 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 0 - (891,679) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 14 entries, memory: 656 bytes - - (99,326) 1 - (115,240) 1 - (176,614) 1 - (180,830) 1 - (343,678) 1 - (398,246) 1 - (411,643) 1 - (557,910) 1 - (590,95) 1 - (601,478) 1 - (623,44) 1 - (729,884) 1 - (825,406) 1 - (891,679) 1 - work:14 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=1024ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -504 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=1024ncols=1024 need 4096 values, invsparse = 256 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -4096 nonzeroes left to fill.. -1491 nonzeroes left to fill.. 
- rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes -1024 slots to fill -all pairs to bucket 10, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5383b00 -inside enumify: 0x7f1ff5383b00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 10, no filling -done assigning buckets -bucket 10 has 1024 dots to do -LAUNCHING BUCKET CODE: 10 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 1.00762ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1010 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... - rmm_wrap_alloc 256 bytes -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 14 entries, memory: 16.4 KB - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 14 entries, memory: 704 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 1 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 1 - (891,679) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 14 entries, memory: 768 bytes - - (99,326) 0 - (115,240) 0 - (176,614) 0 - (180,830) 0 - (343,678) 0 - (398,246) 0 - (411,643) 0 - (557,910) 0 - (590,95) 0 - (601,478) 0 - (623,44) 0 - (729,884) 0 - (825,406) 0 - (891,679) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 14 entries, memory: 656 bytes - - (99,326) 1 - (115,240) 1 - (176,614) 1 - (180,830) 1 - (343,678) 1 - (398,246) 1 - (411,643) 1 - (557,910) 1 - (590,95) 1 - (601,478) 1 - (623,44) 1 - (729,884) 1 - (825,406) 1 - (891,679) 1 - work:14 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 4567 -fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -1024 nonzeroes left to fill.. -35 nonzeroes left to fill.. -inside fill, using seed 543210 -fill_random nrows=1024ncols=1024 need 5120 values, invsparse = 205 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -5120 nonzeroes left to fill.. -2091 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 32 -fill_random nrows=1024ncols=1024 need 2048 values, invsparse = 512 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -2048 nonzeroes left to fill.. -569 nonzeroes left to fill.. 
- rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes -1024 slots to fill -all pairs to bucket 11, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff539d500 -inside enumify: 0x7f1ff539d500 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b57180 -GB_enumify_mask gets mcode: 1 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 4 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 4, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434374399488 -done enumify semiring -scode=397409434374399488 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 8192 bytes -1024 slots to fill -all pairs to bucket 11, no filling -done assigning buckets -bucket 11 has 1024 dots to do -LAUNCHING BUCKET CODE: 11 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434374399488.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_mp -found memory-cached prog GB_jit_AxB_dot3_phase3_mp - got kernel instance AxB_dot3_phase3_mp_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_mp_int32_t_int32_t_int32_t -Launching _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -warp 27 zombie count = 32, nzombies = 0 -warp 11 zombie count = 32, nzombies = 0 -warp 21 zombie count = 32, nzombies = 0 -warp 3 zombie count = 32, nzombies = 0 -warp 6 zombie count = 32, nzombies = 0 -warp 2 zombie count = 32, nzombies = 0 -warp 14 zombie count = 32, nzombies = 0 -warp 7 zombie count = 31, nzombies = 0 -warp 18 zombie count = 32, nzombies = 0 -warp 5 zombie count = 32, nzombies = 0 -warp 10 zombie count = 32, nzombies = 0 -warp 1 zombie count = 32, nzombies = 0 -warp 24 zombie count = 32, nzombies = 0 -warp 15 zombie count = 32, nzombies = 0 -warp 20 zombie count = 32, nzombies = 0 -warp 28 zombie count = 31, nzombies = 0 -warp 4 zombie count = 31, nzombies = 0 -warp 25 zombie count = 32, nzombies = 0 -warp 17 zombie count = 32, nzombies = 0 -warp 12 zombie count = 32, nzombies = 0 -warp 31 zombie count = 32, nzombies = 0 -warp 16 zombie count = 32, nzombies = 0 -warp 30 zombie count = 32, nzombies = 
0 -warp 22 zombie count = 31, nzombies = 0 -warp 0 zombie count = 32, nzombies = 0 -warp 23 zombie count = 32, nzombies = 0 -warp 26 zombie count = 31, nzombies = 0 -warp 9 zombie count = 31, nzombies = 0 - Czombie = 32 - Czombie = 96 - Czombie = 96 - Czombie = 128 - Czombie = 160 - Czombie = 192 -warp 29 zombie count = 32, nzombies = 128 - Czombie = 224 -warp 13 zombie count = 31, nzombies = 160 - Czombie = 255 - Czombie = 287 - Czombie = 383 - Czombie = 383 - Czombie = 383 -warp 19 zombie count = 32, nzombies = 319 - Czombie = 479 - Czombie = 479 - Czombie = 479 - Czombie = 510 - Czombie = 573 - Czombie = 573 - Czombie = 669 - Czombie = 669 - Czombie = 669 - Czombie = 701 - Czombie = 764 - Czombie = 764 - Czombie = 796 - Czombie = 859 - Czombie = 859 - Czombie = 890 - Czombie = 953 - Czombie = 953 - Czombie = 985 -warp 8 zombie count = 31, nzombies = 985 - Czombie = 1016 -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 1.59027ms - - 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 1024 entries, memory: 28.2 KB - pending tuples: 0 max pending: 0 zombies: 1016 - - (0,478) zombie - (0,574) zombie - (2,376) zombie - (5,560) zombie - (6,996) zombie - (7,183) zombie - (7,666) zombie - (8,896) zombie - (9,187) zombie - (10,446) zombie - (11,46) zombie - (11,955) zombie - (12,397) zombie - (12,953) zombie - (13,192) zombie - (14,421) zombie - (15,568) zombie - (16,788) zombie - (16,904) zombie - (17,928) zombie - (18,103) zombie - (19,821) zombie - (19,886) zombie - (20,474) zombie - (21,479) zombie - (21,975) zombie - (22,569) zombie - (23,310) zombie - (24,905) zombie - ... - rmm_wrap_alloc 256 bytes -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 16384 bytes - - 1024x1024 GraphBLAS bool matrix, sparse by row - sparsity control: sparse only - M actual, 1024 entries, memory: 25.2 KB - - (0,478) 1 - (0,574) 1 - (2,376) 1 - (5,560) 1 - (6,996) 1 - (7,183) 1 - (7,666) 1 - (8,896) 1 - (9,187) 1 - (10,446) 1 - (11,46) 1 - (11,955) 1 - (12,397) 1 - (12,953) 1 - (13,192) 1 - (14,421) 1 - (15,568) 1 - (16,788) 1 - (16,904) 1 - (17,928) 1 - (18,103) 1 - (19,821) 1 - (19,886) 1 - (20,474) 1 - (21,479) 1 - (21,975) 1 - (22,569) 1 - (23,310) 1 - (24,905) 1 - ... 
- rmm_wrap_alloc 16384 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 16.3 KB - - (235,522) 1 - (309,328) 1 - (417,366) 0 - (565,490) 0 - (611,759) 0 - (714,475) 1 - (766,915) 0 - (877,722) 0 - - - 1024x1024 GraphBLAS int32_t matrix, hypersparse by row - C_actual, 8 entries, memory: 544 bytes - - (235,522) 1 - (309,328) 1 - (417,366) 0 - (565,490) 0 - (611,759) 0 - (714,475) 1 - (766,915) 0 - (877,722) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 1024x1024 GraphBLAS double matrix, hypersparse by row - Diff actual, 8 entries, memory: 576 bytes - - (235,522) 0 - (309,328) 0 - (417,366) 0 - (565,490) 0 - (611,759) 0 - (714,475) 0 - (766,915) 0 - (877,722) 0 - - - 1024x1024 GraphBLAS bool matrix, hypersparse by row - T actual, 8 entries, memory: 520 bytes - - (235,522) 1 - (309,328) 1 - (417,366) 1 - (565,490) 1 - (611,759) 1 - (714,475) 1 - (766,915) 1 - (877,722) 1 - work:8 gpus:0 [ OK ] AxB_dot3_tests_PLUS_TIMES_3.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (475 ms) -[ RUN ] AxB_dot3_tests_PLUS_TIMES_3.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t -Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes -32 slots to fill -all pairs to bucket 1, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff5384400 -inside enumify: 0x7f1ff5384400 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 1, no filling -done assigning buckets -bucket 1 has 32 dots to do -LAUNCHING BUCKET CODE: 1 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_dndn -found memory-cached prog GB_jit_AxB_dot3_phase3_dndn - got kernel instance AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_dndnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -tid=0, i,j = 6,0 nnzA= 32, nnzB=32 -tid=0, i,j = 1,1 nnzA= 32, nnzB=32 -tid=0, i,j = 12,3 nnzA= 32, nnzB=32 -tid=0, i,j = 17,3 nnzA= 32, nnzB=32 -tid=0, i,j = 19,4 nnzA= 32, nnzB=32 -tid=0, i,j = 19,5 nnzA= 32, nnzB=32 -tid=0, i,j = 22,6 nnzA= 32, nnzB=32 -tid=0, i,j = 24,6 nnzA= 32, nnzB=32 -tid=0, i,j = 10,8 nnzA= 32, nnzB=32 -tid=0, i,j = 19,9 nnzA= 32, nnzB=32 -tid=0, i,j = 31,9 nnzA= 32, nnzB=32 -tid=0, i,j = 13,11 nnzA= 32, nnzB=32 -tid=0, i,j = 11,12 nnzA= 32, nnzB=32 -tid=0, i,j = 24,14 nnzA= 32, nnzB=32 
-tid=0, i,j = 30,15 nnzA= 32, nnzB=32 -tid=0, i,j = 20,16 nnzA= 32, nnzB=32 -tid=0, i,j = 30,17 nnzA= 32, nnzB=32 -tid=0, i,j = 18,18 nnzA= 32, nnzB=32 -tid=0, i,j = 1,19 nnzA= 32, nnzB=32 -tid=0, i,j = 25,20 nnzA= 32, nnzB=32 -tid=0, i,j = 24,21 nnzA= 32, nnzB=32 -tid=0, i,j = 27,21 nnzA= 32, nnzB=32 -tid=0, i,j = 30,22 nnzA= 32, nnzB=32 -tid=0, i,j = 30,23 nnzA= 32, nnzB=32 -tid=0, i,j = 14,24 nnzA= 32, nnzB=32 -tid=0, i,j = 4,25 nnzA= 32, nnzB=32 -tid=0, i,j = 15,26 nnzA= 32, nnzB=32 -tid=0, i,j = 28,27 nnzA= 32, nnzB=32 -tid=0, i,j = 16,28 nnzA= 32, nnzB=32 -tid=0, i,j = 9,29 nnzA= 32, nnzB=32 -tid=0, i,j = 24,30 nnzA= 32, nnzB=32 -tid=0, i,j = 31,31 nnzA= 32, nnzB=32 -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 3.29933ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - - (0,6) 11 - (1,1) 10 - (3,12) 7 - (3,17) 6 - (4,19) 8 - (5,19) 10 - (6,22) 6 - (6,24) 9 - (8,10) 7 - (9,19) 8 - (9,31) 6 - (11,13) 8 - (12,11) 6 - (14,24) 10 - (15,30) 9 - (16,20) 5 - (17,30) 7 - (18,18) 12 - (19,1) 6 - (20,25) 7 - (21,24) 9 - (21,27) 6 - (22,30) 8 - (23,30) 11 - (24,14) 7 - (25,4) 9 - (26,15) 4 - (27,28) 5 - (28,16) 4 - ... -Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.1 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 0 - (4,19) 1 - (5,19) 1 - (6,22) 0 - (6,24) 1 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) 1 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... 
- - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - - (0,6) 11 - (1,1) 10 - (3,12) 7 - (3,17) 6 - (4,19) 8 - (5,19) 10 - (6,22) 6 - (6,24) 9 - (8,10) 7 - (9,19) 8 - (9,31) 6 - (11,13) 8 - (12,11) 6 - (14,24) 10 - (15,30) 9 - (16,20) 5 - (17,30) 7 - (18,18) 12 - (19,1) 6 - (20,25) 7 - (21,24) 9 - (21,27) 6 - (22,30) 8 - (23,30) 11 - (24,14) 7 - (25,4) 9 - (26,15) 4 - (27,28) 5 - (28,16) 4 - (29,9) 7 - (30,24) 10 - (31,31) 10 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 32 entries, memory: 1.5 KB - - (0,6) 11 - (1,1) 10 - (3,12) 7 - (3,17) 6 - (4,19) 8 - (5,19) 10 - (6,22) 6 - (6,24) 9 - (8,10) 7 - (9,19) 8 - (9,31) 6 - (11,13) 8 - (12,11) 6 - (14,24) 10 - (15,30) 9 - (16,20) 5 - (17,30) 7 - (18,18) 12 - (19,1) 6 - (20,25) 7 - (21,24) 9 - (21,27) 6 - (22,30) 8 - (23,30) 11 - (24,14) 7 - (25,4) 9 - (26,15) 4 - (27,28) 5 - (28,16) 4 - (29,9) 7 - (30,24) 10 - (31,31) 10 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 32 entries, memory: 1.2 KB - - (0,6) 0 - (1,1) 0 - (3,12) 0 - (3,17) 0 - (4,19) 0 - (5,19) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 0 - (14,24) 0 - (15,30) 0 - (16,20) 0 - (17,30) 0 - (18,18) 0 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 0 - (24,14) 0 - (25,4) 0 - (26,15) 0 - (27,28) 0 - (28,16) 0 - (29,9) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 32 entries, memory: 1.0 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 1 - (4,19) 1 - (5,19) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,19) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (18,18) 1 - (19,1) 1 - (20,25) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - work:32 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 1024 values, invsparse = 1 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling dense - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 160 values, invsparse = 7 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -160 nonzeroes left to fill.. -62 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 5, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff539d800 -inside enumify: 0x7f1ff539d800 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 5, no filling -done assigning buckets -bucket 5 has 32 dots to do -LAUNCHING BUCKET CODE: 5 -Confiring spdnINside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_spdn -found memory-cached prog GB_jit_AxB_dot3_phase3_spdn - got kernel instance AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_spdnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.500736ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 6 - - (0,6) zombie - (1,1) 2 - (3,12) 2 - (3,17) 2 - (4,19) zombie - (5,19) zombie - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,19) zombie - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) zombie - (19,1) 3 - (20,25) zombie - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.1 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 0 - (4,19) 1 - (5,19) 1 - (6,22) 0 - (6,24) 1 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) 1 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 26 entries, memory: 1.1 KB - - (1,1) 2 - (3,12) 2 - (3,17) 2 - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (19,1) 3 - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - (29,9) 0 - (30,24) 1 - (31,31) 2 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 26 entries, memory: 1.1 KB - - (1,1) 2 - (3,12) 2 - (3,17) 2 - (6,22) 2 - (6,24) 0 - (8,10) 0 - (9,31) 4 - (11,13) 3 - (12,11) 2 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (19,1) 3 - (21,24) 1 - (21,27) 0 - (22,30) 1 - (23,30) 0 - (24,14) 1 - (25,4) 0 - (26,15) 1 - (27,28) 2 - (28,16) 1 - (29,9) 0 - (30,24) 1 - (31,31) 2 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 26 entries, memory: 1.2 KB - - (1,1) 0 - (3,12) 0 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 0 - (11,13) 0 - (12,11) 0 - (14,24) 0 - (15,30) 0 - (16,20) 0 - (17,30) 0 - (19,1) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 0 - (24,14) 0 - (25,4) 0 - (26,15) 0 - (27,28) 0 - (28,16) 0 - (29,9) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 26 entries, memory: 1.0 KB - - (1,1) 1 - (3,12) 1 - (3,17) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,31) 1 - (11,13) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 1 - (17,30) 1 - (19,1) 1 - (21,24) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - (29,9) 1 - (30,24) 1 - (31,31) 1 - work:26 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 320 values, invsparse = 4 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -320 nonzeroes left to fill.. 
-140 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff539f100 -inside enumify: 0x7f1ff539f100 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 6, no filling -done assigning buckets -bucket 6 has 32 dots to do -LAUNCHING BUCKET CODE: 6 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vssp -found memory-cached prog GB_jit_AxB_dot3_phase3_vssp - got kernel instance AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsspIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.628736ms - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 15 - - (0,6) zombie - (1,1) 1 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,19) zombie - (9,31) 1 - (11,13) zombie - (12,11) 0 - (14,24) 1 - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 1 - (20,25) zombie - (21,24) zombie - (21,27) 1 - (22,30) 0 - (23,30) 0 - (24,14) zombie - (25,4) zombie - (26,15) 0 - (27,28) zombie - (28,16) 0 - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.1 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 0 - (4,19) 1 - (5,19) 1 - (6,22) 0 - (6,24) 1 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) 1 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 17 entries, memory: 1.1 KB - - (1,1) 1 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 1 - (12,11) 0 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 17 entries, memory: 1.1 KB - - (1,1) 1 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 1 - (12,11) 0 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 17 entries, memory: 1.2 KB - - (1,1) 0 - (3,17) 0 - (6,22) 0 - (6,24) 0 - (8,10) 0 - (9,31) 0 - (12,11) 0 - (14,24) 0 - (15,30) 0 - (19,1) 0 - (21,27) 0 - (22,30) 0 - (23,30) 0 - (26,15) 0 - (28,16) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 17 entries, memory: 1.0 KB - - (1,1) 1 - (3,17) 1 - (6,22) 1 - (6,24) 1 - (8,10) 1 - (9,31) 1 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (19,1) 1 - (21,27) 1 - (22,30) 1 - (23,30) 1 - (26,15) 1 - (28,16) 1 - (30,24) 1 - (31,31) 1 - work:17 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff539e300 -inside enumify: 0x7f1ff539e300 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 7, no filling -done assigning buckets -bucket 7 has 32 dots to do -LAUNCHING BUCKET CODE: 7 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.480256ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.1 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 0 - (4,19) 1 - (5,19) 1 - (6,22) 0 - (6,24) 1 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) 1 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff53e0600 -inside enumify: 0x7f1ff53e0600 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 8, no filling -done assigning buckets -bucket 8 has 32 dots to do -LAUNCHING BUCKET CODE: 8 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.421888ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32 - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - M actual, 32 entries, memory: 1.1 KB - - (0,6) 1 - (1,1) 1 - (3,12) 1 - (3,17) 0 - (4,19) 1 - (5,19) 1 - (6,22) 0 - (6,24) 1 - (8,10) 0 - (9,19) 0 - (9,31) 0 - (11,13) 0 - (12,11) 1 - (14,24) 1 - (15,30) 1 - (16,20) 0 - (17,30) 0 - (18,18) 1 - (19,1) 0 - (20,25) 0 - (21,24) 0 - (21,27) 0 - (22,30) 0 - (23,30) 1 - (24,14) 1 - (25,4) 1 - (26,15) 1 - (27,28) 1 - (28,16) 1 - ... - - 32x32 GraphBLAS int32_t matrix, sparse by row - sparsity control: sparse only - C GPU, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS int32_t matrix, sparse by row - C_actual, 8 entries, memory: 864 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 1 - (19,1) 0 - (25,4) 1 - (30,24) 0 - (31,31) 0 - - rmm_wrap_alloc 256 bytes - - 32x32 GraphBLAS double matrix, sparse by row - Diff actual, 8 entries, memory: 896 bytes - - (1,1) 0 - (3,17) 0 - (9,31) 0 - (15,30) 0 - (19,1) 0 - (25,4) 0 - (30,24) 0 - (31,31) 0 - - - 32x32 GraphBLAS bool matrix, sparse by row - T actual, 8 entries, memory: 840 bytes - - (1,1) 1 - (3,17) 1 - (9,31) 1 - (15,30) 1 - (19,1) 1 - (25,4) 1 - (30,24) 1 - (31,31) 1 - work:8 gpus:0 Getting test data -Creating problem gen -filling matrices -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 4567 -fill_random nrows=32ncols=32 need 32 values, invsparse = 32 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -32 nonzeroes left to fill.. -2 nonzeroes left to fill.. - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -inside fill, using seed 543210 -fill_random nrows=32ncols=32 need 64 values, invsparse = 16 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -64 nonzeroes left to fill.. -21 nonzeroes left to fill.. - rmm_wrap_alloc 512 bytes - rmm_wrap_alloc 512 bytes -inside fill, using seed 32 -fill_random nrows=32ncols=32 need 128 values, invsparse = 8 -fill_random after alloc values -vdim ready -vlen ready -ready to fill p -filling sparse -128 nonzeroes left to fill.. -43 nonzeroes left to fill.. 
- rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -Building semiring factgory - calling stringify semiring: 0x7f1ff53ead00 -inside enumify: 0x7f1ff53ead00 - - GraphBLAS Semiring: semiring (user-defined) - GraphBLAS Monoid: semiring->add (built-in) - GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 - identity: [ 0 ] - - GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y) - GraphBLAS type: ztype int32_t size: 4 - GraphBLAS type: xtype int32_t size: 4 - GraphBLAS type: ytype int32_t size: 4 -Getting semiring add -Getting semiring mult -Getting semiring add op -Getting types -Getting opcodes -Getting typecodes -Performing asserts -Invoking boolean rename -Invoking boolean rename -Invoking enumify binop -e 14 -Invoking enumify monoid -Calling enumify binop -Inside plus binop code -e 11 -Calling enumify identity -Calling enumify terminal -Done enumify monoid -Done invoking enumify monoid -atype -btype -ctype -Invoking enumify_mask, mtype 0x7f2028b56f40 -GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0 -got mask_ecode: 8 -constructing semiring scode -before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0 -serialized_scode: 397409434378593792 -done enumify semiring -scode=397409434378593792 -done stringify semiring - returned from stringify semiring - rmm_wrap_alloc 256 bytes - rmm_wrap_alloc 256 bytes -32 slots to fill -all pairs to bucket 9, no filling -done assigning buckets -bucket 9 has 32 dots to do -LAUNCHING BUCKET CODE: 9 -INside get cached file -looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write -about to close - read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h -successful_read: 1 -Just closed - jit_cache get program GB_jit_AxB_dot3_phase3_vsvs -found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs - got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t -Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<1,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int) -Printing bucketp -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -Done. -returned from kernel 0.551936ms - - 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row - sparsity control: sparse only - C GPU, 32 entries, memory: 1.1 KB - pending tuples: 0 max pending: 0 zombies: 24 - - (0,6) zombie - (1,1) 0 - (3,12) zombie - (3,17) 0 - (4,19) zombie - (5,19) zombie - (6,22) zombie - (6,24) zombie - (8,10) zombie - (9,19) zombie - (9,31) 0 - (11,13) zombie - (12,11) zombie - (14,24) zombie - (15,30) 1 - (16,20) zombie - (17,30) zombie - (18,18) zombie - (19,1) 0 - (20,25) zombie - (21,24) zombie - (21,27) zombie - (22,30) zombie - (23,30) zombie - (24,14) zombie - (25,4) 1 - (26,15) zombie - (27,28) zombie - (28,16) zombie - ... -Not using cuda path. 
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
Same M actual / C GPU / C_actual / Diff actual / T actual dumps as the previous iteration; GPU and host results agree. work:8 gpus:0
Test data regenerated with the same seeds and sizes (4567, 4567, 543210, 32; 32, 32, 64, and 128 values)
32 slots to fill; all pairs to bucket 10, no filling; building semiring factory; enumify/stringify -> serialized_scode: 397409434378593792 (same plus-times int32_t semiring)
bucket 10 has 32 dots to do; LAUNCHING BUCKET CODE: 10
JIT cache: read /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h; memory-cached kernel AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t launched <<<1,512,0,0>>>
returned from kernel 0.4096ms
C GPU: 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row, 32 entries, 24 zombies
Not using cuda path.
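The enumify dump is identical in every iteration because the semiring never changes. Packing those printed fields most-significant-first, with the bit widths shown below, reproduces the serialized_scode in the log exactly; the widths are inferred from the printed field values, so treat this as a reading of the log rather than the definitive GraphBLAS encoding.

#include <stdint.h>
#include <stdio.h>

/* Sketch: pack the enumified semiring fields printed above into one 64-bit
   scode.  Field order follows the log; the widths (5,5,5,8,1,4,4,4,4,4,4,4,
   2,2,2,2) are inferred, not quoted from the GraphBLAS sources, but with
   these widths the result is 397409434378593792 as printed in the log. */
#define PUSH(code,val,bits) ((code) = ((code) << (bits)) | (uint64_t) (val))

int main (void)
{
    uint64_t scode = 0 ;
    PUSH (scode, 11, 5) ;   /* add_ecode: plus monoid            */
    PUSH (scode,  0, 5) ;   /* id_ecode: identity 0              */
    PUSH (scode, 31, 5) ;   /* term_ecode: no terminal condition */
    PUSH (scode, 14, 8) ;   /* mult_ecode: times operator        */
    PUSH (scode,  0, 1) ;   /* flipxy                            */
    PUSH (scode,  6, 4) ;   /* zcode: int32_t                    */
    PUSH (scode,  6, 4) ;   /* xcode                             */
    PUSH (scode,  6, 4) ;   /* ycode                             */
    PUSH (scode,  8, 4) ;   /* mask_ecode                        */
    PUSH (scode,  6, 4) ;   /* ccode                             */
    PUSH (scode,  6, 4) ;   /* acode                             */
    PUSH (scode,  6, 4) ;   /* bcode                             */
    PUSH (scode,  0, 2) ;   /* csparsity: sparse                 */
    PUSH (scode,  0, 2) ;   /* msparsity                         */
    PUSH (scode,  0, 2) ;   /* asparsity                         */
    PUSH (scode,  0, 2) ;   /* bsparsity                         */
    printf ("scode = %llu\n", (unsigned long long) scode) ;
    return (0) ;
}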
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
Same M actual / C GPU / C_actual / Diff actual / T actual dumps as above; GPU and host results agree. work:8 gpus:0
Test data regenerated (seeds 4567, 4567, 543210, 32; this time 32, 32, 160, and 64 values; invsparse 32, 32, 7, 16)
32 slots to fill; all pairs to bucket 11, no filling; building semiring factory; enumify/stringify -> serialized_scode: 397409434378593792
bucket 11 has 32 dots to do; LAUNCHING BUCKET CODE: 11
JIT cache: read /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h; memory-cached kernel AxB_dot3_phase3_mp_int32_t_int32_t_int32_t launched <<<1,32,0,0>>>
warp 0 zombie count = 27, nzombies = 0; Czombie = 27
returned from kernel 1.20934ms
C GPU: 32x32 GraphBLAS int32_t matrix, sparse (jumbled) by row, 32 entries, 27 zombies
Not using cuda path.
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
M actual: same 32-entry mask as above
C GPU and C_actual: 32x32 GraphBLAS int32_t matrices, 5 entries each: (1,1) 1, (6,24) 0, (9,31) 0, (16,20) 0, (25,4) 1; Diff actual all zero, T actual all true. work:5 gpus:0
[ OK ] AxB_dot3_tests_PLUS_TIMES_3.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (20 ms)
[ RUN ] AxB_dot3_tests_PLUS_TIMES_3.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t
Getting test data; creating problem gen; filling matrices
fill_random (seed 4567): nrows=1024, ncols=1024, need 1024 values, invsparse = 1024; filling sparse (run twice, for M and for C)
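Across all the fill_random lines in this log, "invsparse" is the inverse sparsity: roughly (nrows*ncols)/nvals, so invsparse = 1 is a dense fill and invsparse = 1024 means one value per 1024 slots. A sketch of that bookkeeping is below; the ceiling rounding and the duplicate-tolerant fill loop (the "N nonzeroes left to fill.." countdown) are assumptions about the harness, not its actual code. The log resumes with the dense fills after the sketch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch of the problem generator's bookkeeping.  With ceiling division,
   invsparse matches every case in this log: 160 values on 32x32 gives 7,
   1024 values on 1024x1024 gives 1024, and a full fill gives 1. */
static void fill_random (int64_t nrows, int64_t ncols, int64_t nvals,
                         unsigned seed)
{
    int64_t invsparse = (nrows * ncols + nvals - 1) / nvals ;   /* ceil */
    printf ("fill_random nrows=%lld ncols=%lld need %lld values, "
            "invsparse = %lld\n", (long long) nrows, (long long) ncols,
            (long long) nvals, (long long) invsparse) ;
    srand (seed) ;
    for (int64_t k = 0 ; k < nvals ; k++)
    {
        int64_t i = rand ( ) % nrows, j = rand ( ) % ncols ;
        (void) i ; (void) j ;   /* place a value at (i,j); duplicates collapse,
                                   so the harness keeps filling until done */
    }
}

int main (void)
{
    fill_random (32, 32, 160, 543210) ;     /* invsparse = 7, as in the log */
    return (0) ;
}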
fill_random (seed 543210): nrows=1024, ncols=1024, need 1048576 values, invsparse = 1; filling dense; rmm_wrap_alloc ramps from 4096 bytes up to 8388608 bytes
fill_random (seed 32): nrows=1024, ncols=1024, need 1048576 values, invsparse = 1; filling dense; same allocation ramp
1024 slots to fill; all pairs to bucket 1, no filling; done assigning buckets
Building semiring factory; enumify/stringify semiring -> serialized_scode: 397409434378593792 (same plus-times int32_t semiring as above)
bucket 1 has 1024 dots to do; LAUNCHING BUCKET CODE: 1
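The bucket numbers select which phase3 kernel runs: in this log, bucket 1 maps to dndn (dense times dense), buckets 9 and 10 to vsvs (very sparse times very sparse), and bucket 11 to mp (the general merge-path case). The dispatcher below is a hypothetical illustration of that mapping; the thresholds are invented, and the real phase1 heuristics in GraphBLAS/CUDA are more detailed.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch of phase1 bucketing: each C(i,j) dot is assigned to a
   bucket by the density of its two input vectors, and each bucket maps to
   one specialized phase3 kernel.  Names follow the kernels in this log. */
typedef enum { BUCKET_DNDN = 1, BUCKET_VSVS = 9, BUCKET_MP = 11 } Bucket ;

static Bucket choose_bucket (int64_t nnz_ai, int64_t nnz_bj, int64_t vlen)
{
    if (nnz_ai == vlen && nnz_bj == vlen)
        return (BUCKET_DNDN) ;                  /* both vectors dense       */
    if (nnz_ai < vlen / 16 && nnz_bj < vlen / 16)
        return (BUCKET_VSVS) ;                  /* both very sparse         */
    return (BUCKET_MP) ;                        /* general merge-path case  */
}

int main (void)
{
    printf ("%d\n", choose_bucket (1024, 1024, 1024)) ;  /* 1: dndn kernel */
    printf ("%d\n", choose_bucket (2, 4, 32)) ;          /* 9: vsvs kernel */
    return (0) ;
}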
JIT cache: read /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h (successful_read: 1); found memory-cached prog GB_jit_AxB_dot3_phase3_dndn; kernel instance AxB_dot3_phase3_dndn_int32_t_int32_t_int32_t launched <<<32,32,0,0>>>
[per-dot debug output elided: roughly 1024 lines of "tid=0, i,j = <row>,<col> nnzA= 1024, nnzB=1024", one per C(i,j) dot product]
Printing bucketp: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0; Done.
returned from kernel 13.5107ms
C GPU: 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row, sparsity control: sparse only, 1024 entries, memory: 28.2 KB
first entries: (0,478) 268, (0,574) 246, (2,376) 235, (5,560) 278, (6,996) 255, (7,183) 256, (7,666) 248, (8,896) 255, (9,187) 274, (10,446) 256, (11,46) 270, (11,955) 284, (12,397) 250, (12,953) 259, (13,192) 278, (14,421) 267, (15,568) 251, (16,788) 225, (16,904) 246, (17,928) 240, (18,103) 262, (19,821) 235, (19,886) 236, (20,474) 267, (21,479) 248, (21,975) 251, (22,569) 255, (23,310) 272, (24,905) 262, ...
Not using cuda path.
-M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 16384 bytes
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- M actual, 1024 entries, memory: 28.2 KB
-
- (0,478) 0
- (0,574) 0
- (2,376) 1
- [... 26 more entries elided ...]
- ...
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- C GPU, 1024 entries, memory: 28.2 KB
-
- (0,478) 268
- (0,574) 246
- [... full 1024-entry dump elided ...]
- (1023,840) 264
-
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- C_actual, 1024 entries, memory: 40.2 KB
-
- (0,478) 268
- (0,574) 246
- [... full 1024-entry dump elided; entries are identical to the C GPU result above ...]
- (1023,840) 264
-
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 16384 bytes
-
- 1024x1024 GraphBLAS double matrix, sparse by row
- Diff actual, 1024 entries, memory: 32.2 KB
-
- (0,478) 0
- (0,574) 0
- [... full 1024-entry dump elided; every entry is 0, i.e. the GPU and CPU results agree ...]
- (1023,840) 0
-
-
- 1024x1024 GraphBLAS bool matrix, sparse by row
- T actual, 1024 entries, memory: 25.2 KB
-
- (0,478) 1
- (0,574) 1
- (2,376) 1
- [... 26 more entries elided ...]
- ...
- work:1024 gpus:0 Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-[... second fill pass with seed 4567 elided (identical output) ...]
-inside fill, using seed 543210
-fill_random nrows=1024ncols=1024 need 1048576 values, invsparse = 1
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling dense
-[... 19 "rmm_wrap_alloc" lines elided (allocations from 16384 up to 8388608 bytes) ...]
-inside fill, using seed 32
-fill_random nrows=1024ncols=1024 need 5120 values, invsparse = 205
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-5120 nonzeroes left to fill..
-2026 nonzeroes left to fill..
- rmm_wrap_alloc 16384 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
-Building semiring factgory
- rmm_wrap_alloc 256 bytes
- calling stringify semiring: 0x7f1ff53ef300
-inside enumify: 0x7f1ff53ef300
-
- GraphBLAS Semiring: semiring (user-defined)
- GraphBLAS Monoid: semiring->add (built-in)
- GraphBLAS BinaryOp: monoid->op (built-in) z=plus(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
- identity: [ 0 ]
-
- GraphBLAS BinaryOp: semiring->multiply (built-in) z=times(x,y)
- GraphBLAS type: ztype int32_t size: 4
- GraphBLAS type: xtype int32_t size: 4
- GraphBLAS type: ytype int32_t size: 4
-Getting semiring add
-Getting semiring mult
-Getting semiring add op
-Getting types
-Getting opcodes
-Getting typecodes
-Performing asserts
-Invoking boolean rename
-Invoking boolean rename
-Invoking enumify binop
-e 14
-Invoking enumify monoid
-Calling enumify binop
-Inside plus binop code
-e 11
-Calling enumify identity
-Calling enumify terminal
-Done enumify monoid
-Done invoking enumify monoid
-atype
-btype
-ctype
-Invoking enumify_mask, mtype 0x7f2028b56f40
-GB_enumify_mask gets mcode: 6 Mask_struct: 0 Mask_comp: 0
-got mask_ecode: 8
-constructing semiring scode
-before: add_ecode: 11, id_ecode: 0, term_ecode: 31, mult_ecode: 14, flipxy: 0, zcode: 6, xcode: 6, ycode: 6, mask_ecode: 8, ccode: 6, acode: 6, bcode: 6, csparsity: 0, msparsity: 0, asparsity: 0, bsparsity: 0
-serialized_scode: 397409434378593792
-done enumify semiring
-scode=397409434378593792
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 5, no filling
-done assigning buckets
-bucket 5 has 1024 dots to do
-LAUNCHING BUCKET CODE: 5
-Confiring spdnINside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h
-opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h
-successful_read: 1
-Just closed
- jit_cache get program GB_jit_AxB_dot3_phase3_spdn
-found memory-cached prog GB_jit_AxB_dot3_phase3_spdn
- got kernel instance AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_spdn_int32_t_int32_t_int32_t
-Launching _Z20AxB_dot3_phase3_spdnIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 3.78778ms
-
- 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row
- sparsity control: sparse only
- C GPU, 1024 entries, memory: 28.2 KB
- pending tuples: 0 max pending: 0 zombies: 131
-
- (0,478) 1
- (0,574) 2
- (2,376) zombie
- [... 26 more entries elided ...]
- ...
- rmm_wrap_alloc 256 bytes
-Not using cuda path.
- rmm_wrap_alloc 16384 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  M actual, 1024 entries, memory: 28.2 KB
-
-    (0,478)  0
-    (0,574)  0
-    (2,376)  1
-    ...
- rmm_wrap_alloc 16384 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  C GPU, 893 entries, memory: 28.2 KB
-
-    (0,478)  1
-    (0,574)  2
-    (5,560)  3
-  [... remaining 890 entries elided; identical to the C_actual list below ...]
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  C_actual, 893 entries, memory: 28.2 KB
-
-    (0,478)  1
-    (0,574)  2
-    (5,560)  3
-  [... remaining 890 entries elided ...]
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-
-  1024x1024 GraphBLAS double matrix, sparse by row
-  Diff actual, 893 entries, memory: 32.2 KB
-
-    (0,478)  0
-    (0,574)  0
-    (5,560)  0
-  [... remaining 890 entries elided, all 0 ...]
-
-  1024x1024 GraphBLAS bool matrix, sparse by row
-  T actual, 893 entries, memory: 25.2 KB
-
-    (0,478)  1
-    (0,574)  1
-    (5,560)  1
-    ...
- work:893 gpus:0 Getting test data
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
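Each fill_random line above sizes the generator from an inverse-sparsity parameter: an nrows-by-ncols matrix with invsparse = s gets about nrows*ncols/s values (1024*1024/1024 = 1024 here), and the "nonzeroes left to fill" lines are its progress countdown. A minimal sketch of that sizing logic, assuming a hypothetical generator that simply draws random positions:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main (void)
{
    int64_t nrows = 1024, ncols = 1024, invsparse = 1024 ;
    int64_t need = (nrows * ncols) / invsparse ;    // 1024 values, as logged
    srand (4567) ;                                  // seed taken from the log
    for (int64_t left = need ; left > 0 ; left--)
    {
        int64_t i = rand ( ) % nrows ;              // random row
        int64_t j = rand ( ) % ncols ;              // random column
        // a real generator must also reject duplicate (i,j) positions
        (void) i ; (void) j ;
    }
    printf ("filled %lld values\n", (long long) need) ;
    return 0 ;
}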
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
- rmm_wrap_alloc 8192 bytes
-inside fill, using seed 543210
-fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-2048 nonzeroes left to fill..
-504 nonzeroes left to fill..
-  [... rmm_wrap_alloc lines elided (256 B to 16 KB) ...]
-inside fill, using seed 32
-fill_random nrows=1024 ncols=1024 need 10240 values, invsparse = 103
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-10240 nonzeroes left to fill..
-4633 nonzeroes left to fill..
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 256 bytes
-1024 slots to fill
-all pairs to bucket 6, no filling
-done assigning buckets
-Building semiring factory
- calling stringify semiring: 0x7f1ff5416f00
-inside enumify: 0x7f1ff5416f00
-  [... semiring enumify trace identical to the one above; serialized_scode: 397409434378593792 ...]
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 6, no filling
-done assigning buckets
-bucket 6 has 1024 dots to do
-LAUNCHING BUCKET CODE: 6
-Inside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h
-opening /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h for write
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h
-successful_read: 1
-Just closed
- jit_cache get program GB_jit_AxB_dot3_phase3_vssp
-found memory-cached prog GB_jit_AxB_dot3_phase3_vssp
- got kernel instance AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_vssp_int32_t_int32_t_int32_t
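The cache trace above shows two levels: an on-disk header under ~/.SuiteSparse/GraphBLAS/6.3.0/ named by the scode, and an in-memory program cache keyed by kernel name ("found memory-cached prog ..."). A minimal sketch of that lookup order, with a hypothetical get_cached_program and a string standing in for the compiled program:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

static std::map<std::string, std::string> jit_cache ;  // in-memory level

static std::string get_cached_program (uint64_t scode, const char *kname)
{
    auto hit = jit_cache.find (kname) ;
    if (hit != jit_cache.end ( ))
    {
        printf ("found memory-cached prog %s\n", kname) ;
        return hit->second ;
    }
    // miss: fall back to the on-disk header named by the scode
    char path [512] ;
    const char *home = getenv ("HOME") ;
    snprintf (path, sizeof (path),
        "%s/.SuiteSparse/GraphBLAS/GB_semiring_%llu.h",
        home ? home : ".", (unsigned long long) scode) ;
    printf ("looking for prog in file %s\n", path) ;
    FILE *f = fopen (path, "r") ;
    if (f == NULL)
    {
        // a real JIT would stringify the semiring and write the header here
        f = fopen (path, "w") ;
    }
    if (f != NULL) fclose (f) ;
    jit_cache [kname] = path ;      // stand-in for the compiled program
    return jit_cache [kname] ;
}

int main (void)
{
    // the second call hits the memory cache, as in the log
    get_cached_program (397409434378593792ULL, "GB_jit_AxB_dot3_phase3_vssp") ;
    get_cached_program (397409434378593792ULL, "GB_jit_AxB_dot3_phase3_vssp") ;
    return 0 ;
}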
-Launching _Z20AxB_dot3_phase3_vsspIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 1.00352ms
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  C GPU, 1024 entries, memory: 28.2 KB
-  pending tuples: 0 max pending: 0 zombies: 1001
-
-    (0,478)  zombie
-    (0,574)  zombie
-    (2,376)  zombie
-    ...
- rmm_wrap_alloc 256 bytes
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 16384 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  M actual, 1024 entries, memory: 28.2 KB
-
-    (0,478)  0
-    (0,574)  0
-    (2,376)  1
-    ...
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 256 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  C GPU, 23 entries, memory: 16.6 KB
-
-    (42,324)   0
-    (73,665)   1
-    (106,652)  0
-    (138,288)  1
-    (242,124)  1
-    (295,1)    0
-    (300,554)  1
-    (312,61)   0
-    (344,384)  0
-    (496,267)  0
-    (587,254)  1
-    (686,202)  0
-    (708,925)  1
-    (715,751)  0
-    (729,884)  0
-    (741,365)  1
-    (751,348)  1
-    (792,636)  0
-    (857,151)  0
-    (876,357)  0
-    (940,414)  0
-    (945,671)  0
-    (968,400)  1
-
-  1024x1024 GraphBLAS int32_t matrix, hypersparse by row
-  C_actual, 23 entries, memory: 1.1 KB
-
-  [... same 23 entries as the C GPU list above ...]
-  [... repeated rmm_wrap_alloc 256 bytes lines elided ...]
-
-  1024x1024 GraphBLAS double matrix, hypersparse by row
-  Diff actual, 23 entries, memory: 1.2 KB
-
-  [... same 23 positions, every value 0 ...]
-
-  1024x1024 GraphBLAS bool matrix, hypersparse by row
-  T actual, 23 entries, memory: 1.0 KB
-
-  [... same 23 positions, every value 1 ...]
- work:23 gpus:0 Getting test data
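Each "Launching ...<<<g,b,0,0>>>" line above is a plain CUDA launch configuration: grid size, block size, dynamic shared-memory bytes, and stream. The spdn and vssp kernels run as 32 blocks of 32 threads; the vsvs kernel later uses 2 blocks of 512. A minimal sketch with a hypothetical stub kernel standing in for the templated dot3 kernels:

#include <cstdio>

// Stand-in for AxB_dot3_phase3_*<int32_t,int32_t,int32_t>:
// one thread per dot product in this sketch.
__global__ void dot3_stub (long ndots)
{
    long k = blockIdx.x * (long) blockDim.x + threadIdx.x ;
    if (k < ndots)
    {
        // the real kernels compute C(i,j) = A(:,i)'*B(:,j) here
    }
}

int main (void)
{
    long ndots = 1024 ;
    dot3_stub<<<32, 32, 0, 0>>> (ndots) ;   // <<<grid, block, shmem, stream>>>
    cudaDeviceSynchronize ( ) ;
    printf ("returned from kernel\n") ;
    return 0 ;
}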
-Creating problem gen
-filling matrices
-inside fill, using seed 4567
-fill_random nrows=1024 ncols=1024 need 1024 values, invsparse = 1024
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-1024 nonzeroes left to fill..
-35 nonzeroes left to fill..
-inside fill, using seed 4567
-  [... second fill pass, identical trace ...]
-inside fill, using seed 543210
-fill_random nrows=1024 ncols=1024 need 2048 values, invsparse = 512
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-2048 nonzeroes left to fill..
-504 nonzeroes left to fill..
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 16384 bytes
-inside fill, using seed 32
-fill_random nrows=1024 ncols=1024 need 4096 values, invsparse = 256
-fill_random after alloc values
-vdim ready
-vlen ready
-ready to fill p
-filling sparse
-4096 nonzeroes left to fill..
-1491 nonzeroes left to fill..
- rmm_wrap_alloc 16384 bytes
-1024 slots to fill
-all pairs to bucket 7, no filling
-done assigning buckets
-Building semiring factory
- calling stringify semiring: 0x7f1ff542a200
-inside enumify: 0x7f1ff542a200
-  [... semiring enumify trace identical to the one above; serialized_scode: 397409434378593792 ...]
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 7, no filling
-done assigning buckets
-bucket 7 has 1024 dots to do
-LAUNCHING BUCKET CODE: 7
-Inside get cached file
-  [... cached-file lookup for GB_semiring_397409434378593792.h, identical to the trace above ...]
- jit_cache get program GB_jit_AxB_dot3_phase3_vsvs
-found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs
- got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t
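"all pairs to bucket 7, no filling ... bucket 7 has 1024 dots to do" is the phase that routes every masked dot product to a specialized kernel (spdn, vssp, vsvs, ...) according to the estimated work of its two sparse vectors. A minimal sketch of that routing, with invented thresholds and bucket numbers chosen only to mirror this log:

#include <cstdint>
#include <cstdio>

// Hypothetical bucket choice for one dot product C(i,j) = A(:,i)'*B(:,j),
// based only on the entry counts of the two vectors; the real dot3 code
// also inspects the sparsity formats.
static int choose_bucket (int64_t anz, int64_t bnz)
{
    if (anz >= 256 && bnz >= 256) return 5 ;    // both dense-ish: spdn
    if (anz >= 256 || bnz >= 256) return 6 ;    // sparse times dense-ish: vssp
    return 7 ;                                  // both very sparse: vsvs
}

int main (void)
{
    int64_t counts [12] = { 0 } ;
    for (int k = 0 ; k < 1024 ; k++) counts [choose_bucket (4, 4)]++ ;
    for (int b = 0 ; b < 12 ; b++)
    {
        if (counts [b] > 0)
            printf ("bucket %d has %lld dots to do\n", b, (long long) counts [b]) ;
    }
    return 0 ;
}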
-Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 0.776192ms
-
-  1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row
-  sparsity control: sparse only
-  C GPU, 1024 entries, memory: 28.2 KB
-  pending tuples: 0 max pending: 0 zombies: 1010
-
-    (0,478)  zombie
-    (0,574)  zombie
-    (2,376)  zombie
-    ...
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 16384 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  M actual, 1024 entries, memory: 28.2 KB
-
-    (0,478)  0
-    (0,574)  0
-    (2,376)  1
-    ...
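The "zombie" entries above are not corruption: when the mask rules out C(i,j), the dot3 kernels leave the entry in place and mark it dead by flipping its row index to a negative value; a later GB_wait prunes all zombies at once (hence "zombies: 1010"). A minimal sketch of that convention, writing the flip as i -> -i-2 so that row 0 stays distinguishable; this mirrors GraphBLAS's internal GB_FLIP, but treat the exact macro here as an assumption:

#include <cstdint>
#include <cstdio>

#define GB_FLIP(i)       (-(i) - 2)     // assumed zombie encoding
#define GB_IS_ZOMBIE(i)  ((i) < 0)

int main (void)
{
    // hypothetical row indices of one sparse vector, two marked dead
    int64_t Ci [5] = { 0, GB_FLIP (2), 5, GB_FLIP (18), 24 } ;
    int64_t nzombies = 0 ;
    for (int k = 0 ; k < 5 ; k++)
    {
        if (GB_IS_ZOMBIE (Ci [k])) nzombies++ ;
    }
    printf ("zombies: %lld\n", (long long) nzombies) ;
    return 0 ;
}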
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 256 bytes
-
-  1024x1024 GraphBLAS int32_t matrix, sparse by row
-  sparsity control: sparse only
-  C GPU, 14 entries, memory: 16.4 KB
-
-    (99,326)   0
-    (115,240)  0
-    (176,614)  0
-    (180,830)  1
-    (343,678)  0
-    (398,246)  0
-    (411,643)  0
-    (557,910)  0
-    (590,95)   0
-    (601,478)  0
-    (623,44)   0
-    (729,884)  0
-    (825,406)  1
-    (891,679)  0
-
-  1024x1024 GraphBLAS int32_t matrix, hypersparse by row
-  C_actual, 14 entries, memory: 704 bytes
-
-  [... same 14 entries as the C GPU list above ...]
-  [... repeated rmm_wrap_alloc 256 bytes lines elided ...]
-
-  1024x1024 GraphBLAS double matrix, hypersparse by row
-  Diff actual, 14 entries, memory: 768 bytes
-
-  [... same 14 positions, every value 0 ...]
-
-  1024x1024 GraphBLAS bool matrix, hypersparse by row
-  T actual, 14 entries, memory: 656 bytes
-
-  [... same 14 positions, every value 1 ...]
- work:14 gpus:0 Getting test data
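The Diff and T matrices above are the test's verdict: Diff = C GPU - C_actual must be zero at every surviving position, and T is the boolean "values match" matrix (all true). A minimal sketch of that check, assuming the two results have already been reduced to value arrays matched position-by-position:

#include <cmath>
#include <cstdio>

int main (void)
{
    // values of the 14 matched (i,j) positions from the dumps above
    double c_gpu [14]    = { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0 } ;
    double c_actual [14] = { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0 } ;
    double maxdiff = 0 ;
    int ok = 1 ;
    for (int k = 0 ; k < 14 ; k++)
    {
        double d = fabs (c_gpu [k] - c_actual [k]) ;    // one Diff entry
        if (d != 0) ok = 0 ;                            // T entry would be false
        maxdiff = fmax (maxdiff, d) ;
    }
    printf ("max |Diff| = %g, %s\n", maxdiff, ok ? "all match" : "MISMATCH") ;
    return 0 ;
}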
-Creating problem gen
-filling matrices
-  [... four fill passes identical to the previous test (seeds 4567, 4567, 543210, 32) ...]
- rmm_wrap_alloc 16384 bytes
-1024 slots to fill
-all pairs to bucket 8, no filling
-done assigning buckets
-Building semiring factory
- calling stringify semiring: 0x7f1ff5447700
-inside enumify: 0x7f1ff5447700
-  [... semiring enumify trace identical to the one above; serialized_scode: 397409434378593792 ...]
-done stringify semiring
- returned from stringify semiring
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-1024 slots to fill
-all pairs to bucket 8, no filling
-done assigning buckets
-bucket 8 has 1024 dots to do
-LAUNCHING BUCKET CODE: 8
-Inside get cached file
-  [... cached-file lookup for GB_semiring_397409434378593792.h, identical to the trace above ...]
- jit_cache get program GB_jit_AxB_dot3_phase3_vsvs
-found memory-cached prog GB_jit_AxB_dot3_phase3_vsvs
- got kernel instance AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t
-found memory-cached prog AxB_dot3_phase3_vsvs_int32_t_int32_t_int32_t
-Launching _Z20AxB_dot3_phase3_vsvsIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<2,512,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 0.867296ms
-
-  1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row
-  sparsity control: sparse only
-  C GPU, 1024 entries, memory: 28.2 KB
-  pending tuples: 0 max pending: 0 zombies: 1010
-
-    (0,478)  zombie
-    (0,574)  zombie
-    (2,376)  zombie
-    ...
- rmm_wrap_alloc 256 bytes
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 16384 bytes (repeated 2x)
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- M actual, 1024 entries, memory: 28.2 KB
- [entries (0,478) through (24,905) elided]
- ...
- [CPU-path comparison elided: C GPU (14 entries) matches C_actual, Diff actual all 0, T actual all 1, work:14 gpus:0 — the same output shown above]
-[the bucket 9 and bucket 10 iterations repeat this sequence nearly verbatim: the same test-data generation (seeds 4567, 4567, 543210, 32), the same plus-times int32 semiring with serialized_scode 397409434378593792, the same AxB_dot3_phase3_vsvs kernel launched <<<2,512,0,0>>>, returning in 0.913408ms (bucket 9) and 0.871424ms (bucket 10); each run leaves 1024 entries with 1010 zombies on the GPU, and each CPU-path check again yields the identical 14 surviving entries]
-[test-data generation for the final iteration differs: seed 543210 fills 5120 values (invsparse 205) and seed 32 fills 2048 values (invsparse 512)]
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 256 bytes
-1024 slots to fill
-all pairs to bucket 11, no filling
-done assigning buckets
-Building semiring factory
- calling stringify semiring: 0x7f1ff542b400
-[same plus-times int32 semiring enumified as above; serialized_scode: 397409434378593792]
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 8192 bytes
-bucket 11 has 1024 dots to do
-LAUNCHING BUCKET CODE: 11
-Inside get cached file
-looking for prog in file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_semiring_397409434378593792.h
-successful_read: 1
- jit_cache get program GB_jit_AxB_dot3_phase3_mp
-found memory-cached prog GB_jit_AxB_dot3_phase3_mp
- got kernel instance AxB_dot3_phase3_mp_int32_t_int32_t_int32_t
-Launching _Z18AxB_dot3_phase3_mpIiiiEvxxPxP16GB_Matrix_opaqueS2_S2_S2_i<<<32,32,0,0>>>(long,long,long*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,GB_Matrix_opaque*,int)
-[per-warp trace elided: warps 0-31 each report a zombie count of 31 or 32 with nzombies = 0, then the running Czombie total climbs 64, 95, 127, ... 985, 1016 as warps commit their counts]
-Printing bucketp
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-Done.
-returned from kernel 1.59642ms
-
- 1024x1024 GraphBLAS int32_t matrix, sparse (jumbled) by row
- sparsity control: sparse only
- C GPU, 1024 entries, memory: 28.2 KB
- pending tuples: 0 max pending: 0 zombies: 1016
- [zombie entries (0,478) through (24,905) elided]
- ...
- rmm_wrap_alloc 256 bytes
-Not using cuda path. M_is_hypersparse: 0, A->iso: 0, B->iso: 0, A_BITMAP: 0, B_BITMAP: 0, GB_IS_FULL(A): 0, GB_IS_FULL(B): 0, semiring header size: 32
- rmm_wrap_alloc 16384 bytes
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- M actual, 1024 entries, memory: 28.2 KB
- [entries elided; same M as above]
- ...
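The per-warp tallies in the trace above follow a common CUDA idiom: each lane votes on whether its dot product produced a zombie (a deleted entry), the warp reduces the votes, and one lane commits the subtotal with an atomic. A sketch of that pattern, assuming a ballot-plus-atomic design; the real AxB_dot3_phase3_mp kernel's bookkeeping may differ:

// Hypothetical warp-aggregated zombie counter (illustrative, not the
// actual GraphBLAS kernel).  Czombie here mirrors the running total
// printed in the log.
__device__ unsigned long long Czombie ;

__device__ void tally_zombie (bool is_zombie)
{
    unsigned votes = __ballot_sync (0xFFFFFFFFu, is_zombie) ;
    int warp_count = __popc (votes) ;           // 0..32 zombies per warp
    if ((threadIdx.x & 31) == 0 && warp_count > 0)
    {
        // one atomic per warp instead of one per zombie; the totals in
        // the log (64, 95, 127, ...) climb as warps commit in any order
        atomicAdd (&Czombie, (unsigned long long) warp_count) ;
    }
}

This is why individual warps report counts of 31 or 32 while nzombies is still 0: the global total only materializes as each warp's leader lane lands its atomicAdd.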
- rmm_wrap_alloc 16384 bytes
- rmm_wrap_alloc 256 bytes (repeated 2x)
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- C GPU, 8 entries, memory: 16.3 KB
-
- (235,522)   1
- (309,328)   1
- (417,366)   0
- (565,490)   0
- (611,759)   0
- (714,475)   1
- (766,915)   0
- (877,722)   0
-
- 1024x1024 GraphBLAS int32_t matrix, hypersparse by row
- C_actual, 8 entries, memory: 544 bytes
- [same 8 entries as C GPU]
- rmm_wrap_alloc 256 bytes (repeated 7x)
-
- 1024x1024 GraphBLAS double matrix, hypersparse by row
- Diff actual, 8 entries, memory: 576 bytes
- [same 8 positions, every value 0]
-
- 1024x1024 GraphBLAS bool matrix, hypersparse by row
- T actual, 8 entries, memory: 520 bytes
- [same 8 positions, every value 1]
- work:8 gpus:0 [       OK ] AxB_dot3_tests_PLUS_TIMES_3.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (480 ms)
-[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_3 (3226 ms total)
-
-[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_4
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_4.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t
- rmm_wrap_alloc 256 bytes (repeated 4x)
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- my mat, 32 entries, memory: 1.5 KB
- [entries (i,1) = i for i = 0..28 shown, remainder elided]
- ...
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- A, 32 entries, memory: 1.5 KB
- [entries (i,1) = i for i = 0..31]
-
- jit_cache get program GB_jit_reduceNonZombiesWarp
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/GB_jit_reduceNonZombiesWarp
- got kernel instance reduceNonZombiesWarp_int32_t_int32_t_true
-about to close
- read cache file /home/cjnolet/.SuiteSparse/GraphBLAS/6.3.0/reduceNonZombiesWarp_int32_t_int32_t_true
----------------------------------------
--- Linker for void reduceNonZombiesWarp(GB_Matrix_opaque*, GB_Scalar_opaque*, unsigned int) ---
----------------------------------------
-info : 0 bytes gmem
-info : Function properties for '_Z20reduceNonZombiesWarpIiiLb1EEvP16GB_Matrix_opaqueP16GB_Scalar_opaquej':
-info : used 32 registers, 328 stack, 128 bytes smem, 372 bytes cmem[0], 0 bytes lmem
-
----------------------------------------
-Launching _Z20reduceNonZombiesWarpIiiLb1EEvP16GB_Matrix_opaqueP16GB_Scalar_opaquej<<<1,128,0,0>>>(GB_Matrix_opaque*,GB_Scalar_opaque*,unsigned int)
-Sum: 496
-Invoking grb reduce
- work:32 gpus:0 Done.
-Results matched!
-[       OK ] AxB_dot3_tests_PLUS_TIMES_4.tinyxtinyPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (2 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_4.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- my mat, 1024 entries, memory: 40.2 KB
- [entries (i,1) = i for i = 0..28 shown, remainder elided]
- ...
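The reduction result above is easy to check from the log alone: the 32-entry matrix holds the values 0..31, and Sum: 496 = 31*32/2 (likewise, the 1024-entry runs that follow report 523776 = 1023*1024/2). A kernel shaped like reduceNonZombiesWarp typically builds on the classic warp-shuffle tree sum; a sketch of that primitive, assuming the standard pattern rather than the kernel's exact source:

// Warp-level tree reduction: after five shuffle rounds, lane 0 holds
// the sum of all 32 lanes.  A full kernel would first skip zombie
// entries, then combine per-warp partials through shared memory.
__inline__ __device__ int warp_sum (int v)
{
    for (int offset = 16 ; offset > 0 ; offset >>= 1)
        v += __shfl_down_sync (0xFFFFFFFFu, v, offset) ;
    return v ;   // valid in lane 0
}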
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- A, 1024 entries, memory: 40.2 KB
- [entries (i,1) = i for i = 0..1023 elided]
-
- jit_cache get program GB_jit_reduceNonZombiesWarp
-found memory-cached prog GB_jit_reduceNonZombiesWarp
- got kernel instance reduceNonZombiesWarp_int32_t_int32_t_true
-found memory-cached prog reduceNonZombiesWarp_int32_t_int32_t_true
-Launching _Z20reduceNonZombiesWarpIiiLb1EEvP16GB_Matrix_opaqueP16GB_Scalar_opaquej<<<8,128,0,0>>>(GB_Matrix_opaque*,GB_Scalar_opaque*,unsigned int)
-Sum: 523776
-Invoking grb reduce
- rmm_wrap_alloc 256 bytes
- work:1024 gpus:0 Done.
-Results matched!
-[       OK ] AxB_dot3_tests_PLUS_TIMES_4.smallxsmallPLUS_TIMES_Cint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t (5 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_4.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t
- rmm_wrap_alloc 256 bytes (repeated 7x)
- rmm_wrap_alloc 512 bytes
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- my mat, 32 entries, memory: 1.5 KB
- [entries (i,1) = i for i = 0..31 elided]
-
- 32x32 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- A, 32 entries, memory: 1.5 KB
- [entries (i,1) = i for i = 0..31 elided]
-
- jit_cache get program GB_jit_reduceNonZombiesWarp
-found memory-cached prog GB_jit_reduceNonZombiesWarp
- got kernel instance reduceNonZombiesWarp_int32_t_int32_t_true
-found memory-cached prog reduceNonZombiesWarp_int32_t_int32_t_true
-Launching _Z20reduceNonZombiesWarpIiiLb1EEvP16GB_Matrix_opaqueP16GB_Scalar_opaquej<<<1,128,0,0>>>(GB_Matrix_opaque*,GB_Scalar_opaque*,unsigned int)
-Sum: 496
-Invoking grb reduce
- work:32 gpus:0 Done.
-Results matched!
-[       OK ] AxB_dot3_tests_PLUS_TIMES_4.tinyxtinyPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (0 ms)
-[ RUN      ] AxB_dot3_tests_PLUS_TIMES_4.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t
- rmm_wrap_alloc 256 bytes
- rmm_wrap_alloc 512 bytes
- rmm_wrap_alloc 16384 bytes
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- my mat, 1024 entries, memory: 40.2 KB
- [entries (i,1) = i for i = 0..28 shown, remainder elided]
- ...
-
- 1024x1024 GraphBLAS int32_t matrix, sparse by row
- sparsity control: sparse only
- A, 1024 entries, memory: 40.2 KB
- [entries (i,1) = i for i = 0..1023 elided; identical to the dump above]
(1019,1) 1019 - (1020,1) 1020 - (1021,1) 1021 - (1022,1) 1022 - (1023,1) 1023 - - jit_cache get program GB_jit_reduceNonZombiesWarp -found memory-cached prog GB_jit_reduceNonZombiesWarp - got kernel instance reduceNonZombiesWarp_int32_t_int32_t_true -found memory-cached prog reduceNonZombiesWarp_int32_t_int32_t_true -Launching _Z20reduceNonZombiesWarpIiiLb1EEvP16GB_Matrix_opaqueP16GB_Scalar_opaquej<<<8,128,0,0>>>(GB_Matrix_opaque*,GB_Scalar_opaque*,unsigned int) -Sum: 523776 -Invoking grb reduce - rmm_wrap_alloc 256 bytes - work:1024 gpus:0 Done. -Results matched! -[ OK ] AxB_dot3_tests_PLUS_TIMES_4.smallxsmallPLUS_TIMES_Cint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t (5 ms) -[----------] 4 tests from AxB_dot3_tests_PLUS_TIMES_4 (14 ms total) - -[----------] Global test environment tear-down -[==========] 16 tests from 4 test suites ran. (4206 ms total) -[ PASSED ] 16 tests. -Tests complete diff --git a/GraphBLAS/CUDA/test/problem_spec.hpp b/GraphBLAS/CUDA/test/problem_spec.hpp new file mode 100644 index 000000000..dc8a5b5a7 --- /dev/null +++ b/GraphBLAS/CUDA/test/problem_spec.hpp @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include +#include +#include +//#include "GB_binary_search.h" +#include "GpuTimer.h" +#include "GB_cuda_buckets.h" +#include "../../rmm_wrap/rmm_wrap.h" +#include +#include "test_data.hpp" +extern "C" { +#include "GB.h" +} + +#include "../jitFactory.hpp" +#include "dataFactory.hpp" + +template +class mxm_problem_spec { + +public: + mxm_problem_spec(GrB_Monoid monoid_, GrB_BinaryOp binop_, int64_t N_, int64_t Annz_, int64_t Bnnz_, int64_t Cnnz_, + int sparsity_control_A_ = GxB_SPARSE, int sparsity_control_B_ = GxB_SPARSE) : + mysemiring(), binop(binop_), monoid(monoid_), N(N_), + G(N_, N_), Annz(Annz_), Bnnz(Bnnz_), Cnnz(Cnnz_), mask_struct(true), flipxy(false), mask_comp(false) { + + // FIXME: This should be getting set automatically somehow. 
+ float Cnzpercent = (float) Cnnz_/(N_*N_); + + // TODO: Allocate and fill arrays for buckets and nano buckets + G.init_A(Annz_, sparsity_control_A_, GxB_BY_ROW); + G.init_B(Bnnz_, sparsity_control_B_, GxB_BY_ROW); + G.init_C(Cnzpercent); +// G.fill_buckets( TB ); // all elements go to testbucket= TB + + /************************ + * Create mxm factory + */ + auto grb_info = GrB_Semiring_new(&mysemiring, monoid_, binop_); + GRB_TRY (grb_info) ; + GrB_Matrix A = G.getA(); + GrB_Matrix B = G.getB(); + //GRB_TRY (GxB_Matrix_fprint (A, "A", GxB_SHORT_VERBOSE, stdout)) ; + //GRB_TRY (GxB_Matrix_fprint (B, "B", GxB_SHORT_VERBOSE, stdout)) ; + } + + ~mxm_problem_spec() { + + std::cout << "Calling G.del()" << std::endl; + G.del(); + + } + + GrB_Matrix getC(){ return G.getC(); } + GrB_Matrix getM(){ return G.getM(); } + GrB_Matrix getA(){ return G.getA(); } + GrB_Matrix getB(){ return G.getB(); } + + GrB_Monoid getMonoid() { return monoid; } + GrB_BinaryOp getBinaryOp() { return binop; } + + int64_t getN() { return N; } + int64_t getAnnz() { return Annz; } + int64_t getBnnz() { return Bnnz; } + int64_t getCnnz() { return Cnnz; } + + auto &getG() { return G; } + + GB_cuda_mxm_factory &get_mxm_factory() { + + // Lazily create the mxm factory + if(!mymxmfactory.has_value()) { + + mymxmfactory.emplace(GB_cuda_mxm_factory()); + GrB_Matrix C = G.getC(); + GrB_Matrix M = G.getM(); + GrB_Matrix A = G.getA(); + GrB_Matrix B = G.getB(); + + bool C_iso = false ; + int C_sparsity = GB_sparsity (M) ; + GrB_Type ctype = binop->ztype ; + + (*mymxmfactory).mxm_factory ( + C_iso, C_sparsity, ctype, + M, mask_struct, mask_comp, + mysemiring, flipxy, + A, B) ; + } + return *mymxmfactory; + } + GrB_Semiring get_semiring() { return mysemiring; } + + void set_sparsity_control(GrB_Matrix mat, int gxb_sparsity_control, int gxb_format) { + GRB_TRY (GxB_Matrix_Option_set (mat, GxB_SPARSITY_CONTROL, gxb_sparsity_control)) ; + GRB_TRY (GxB_Matrix_Option_set(mat, GxB_FORMAT, gxb_format)); + GRB_TRY (GrB_Matrix_wait (mat, GrB_MATERIALIZE)) ; + } + + bool get_mask_struct() { return mask_struct; } + +private: + + bool mask_struct{false}; + bool flipxy{false}; + bool mask_comp{false}; + + int64_t Annz; + int64_t Bnnz; + int64_t Cnnz; + int64_t N; + GrB_BinaryOp binop; + GrB_Monoid monoid; + GrB_Semiring mysemiring; + std::optional mymxmfactory; + SpGEMM_problem_generator G; +}; diff --git a/GraphBLAS/CUDA/test/rmm_log.txt b/GraphBLAS/CUDA/test/rmm_log.txt deleted file mode 100644 index 595ee97d6..000000000 --- a/GraphBLAS/CUDA/test/rmm_log.txt +++ /dev/null @@ -1,2 +0,0 @@ -[1369531][11:21:15:565775][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- -[1369531][11:21:15:566007][error ] [A][Stream 0x1][Upstream 262144B][FAILURE maximum pool size exceeded] diff --git a/GraphBLAS/CUDA/test/run_tests.cpp b/GraphBLAS/CUDA/test/run_tests.cpp index 55a666e86..f0663c4e9 100644 --- a/GraphBLAS/CUDA/test/run_tests.cpp +++ b/GraphBLAS/CUDA/test/run_tests.cpp @@ -15,7 +15,8 @@ int main(int argc, char **argv) { //printf(" pool init size %ld, max size %ld\n", init_size, max_size); rmm_wrap_initialize( rmm_wrap_managed, init_size, max_size ); - GRB_TRY (GxB_init(GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free)) ; + GRB_TRY (GxB_init (GxB_NONBLOCKING_GPU, + rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free)) ; GRB_TRY (GxB_Global_Option_set (GxB_GLOBAL_GPU_CONTROL, GxB_GPU_ALWAYS)) ; size_t buff_size = (1ULL<<13)+152; diff --git a/GraphBLAS/CUDA/test/testGen.py b/GraphBLAS/CUDA/test/testGen.py 
deleted file mode 100644 index 4b78c942d..000000000 --- a/GraphBLAS/CUDA/test/testGen.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Generate test instances from a large tensor product set of options - -Monoids = ["PLUS","MIN","MAX","TIMES","ANY"] -Binops = ["TIMES", "PLUS", "MIN", "MAX", "DIV","MINUS", "RDIV","RMINUS","FIRST","SECOND","PAIR"] -Semirings = ["PLUS_TIMES", "MIN_PLUS", "MAX_PLUS"] -#Semirings = ["PLUS_TIMES"]#,"MIN_PLUS"] #, "MAX_PLUS"] - -#DataTypes = ["bool","int8_t","int16_t", "int32_t", "int64_t", -# "uint8_t", "uint16_t", "uint32_t", "uint64_t", -# "float","double"] -DataTypes = ["int32_t", "int64_t", "uint32_t","uint64_t","float","double"] -#DataTypes = ["float","double"] -DataTypes = ["int32_t","uint64_t"] - -DataShapes ={ - "tinyxtiny": {'N':32, 'Anz':256, 'Bnz':128}, - "smallxsmall": {'N':1024, 'Anz': 65_536, 'Bnz':65_536} - # "medxmed": {'N':4096, 'Anz': 2**20, 'Bnz':2**20} - # "largexlarge": {'N':2**16, 'Anz': 64*2**20, 'Bnz':64*2**20} - } - -Kernels= ["warp","mp", "vsvs","dndn", "spdn","vssp"] -Kernels= ["warp"] #, "vsvs","dndn", "spdn","vssp"] - - - -def buildTest(ts="TestsuiteName",kern="vsvs", ds= "tiny-tiny", SR = "PLUS_TIMES",phase=3, - typeC="int",typeM="int",typeA="int",typeB="int",type_x="int",type_y="int",type_z="int"): - - # build string interpolation from pieces - Test_name = f"{ds}{SR}C{typeC}M{typeM}A{typeA}B{typeB}X{type_x}Y{type_y}Z{type_z}" - - Test_suite = ts - #print(Test_suite) - TEST_HEAD = f"""TEST( {Test_suite}, {Test_name})""" - #print(TEST_HEAD) - N = DataShapes[ds]['N'] - Anz = DataShapes[ds]['Anz'] - Bnz = DataShapes[ds]['Bnz'] - phase1_body= f""" test_AxB_dot3_phase1_factory< {typeC}, {typeM}, {typeA}, {typeB}>( 5, {N}, {Anz},{Bnz});""" - phase2_body= f""" test_AxB_dot3_phase2_factory< {typeC} >( 5, {N}, {Anz},{Bnz});""" - phase3_body = f""" test_AxB_dot3_{kern}_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (5, {N}, {Anz}, {Bnz}, SR);""" - #print( TEST_BODY) - phasedict = { 1: phase1_body, 2: phase2_body, 3: phase3_body} - TEST_BODY= phasedict[phase] - - return TEST_HEAD,TEST_BODY - - -if __name__ == "__main__": - - - #print( buildTest()) #test if anything works - - - outfile = f"""AxB_dot3_test_instances.hpp""" - fp = open(outfile, 'w') - - - for k in Kernels: - Test_suite = f'AxB_dot3_tests_{k}' - for SR in Semirings: - for dtC in DataTypes: - dtX = dtC - dtY = dtC - dtZ = dtC - for dtM in ["bool", "int32_t"]: - for dtA in DataTypes: - for dtB in DataTypes: - for ds in DataShapes: - for phase in [3]: - - TEST_HEAD, TEST_BODY = buildTest( Test_suite, k, ds, SR, phase, - dtC, dtM, dtA, dtB, dtX, dtY, dtZ) - fp.write( TEST_HEAD) - fp.write( """{ std::string SR = "%s"; """%SR) - fp.write( TEST_BODY) - fp.write( "}\n") - - - fp.close() - diff --git a/GraphBLAS/CUDA/test/testGen_cmake.py b/GraphBLAS/CUDA/test/testGen_cmake.py index f48996869..ade7381ad 100644 --- a/GraphBLAS/CUDA/test/testGen_cmake.py +++ b/GraphBLAS/CUDA/test/testGen_cmake.py @@ -5,10 +5,44 @@ SUPPORTED_TYPES = { "int32_t": "INT32", - "uint32_t": "UINT32" + "uint32_t": "UINT32", + "int64_t": "INT64", + "uint64_t": "UINT64", + "bool": "BOOL", + "float": "FP32", + "double": "FP64" } -DOT3_BUCKETS = [1, 5, 6, 7, 8, 9, 10, 11] +DOT3_BUCKETS = [1, 2] # NBUCKETS, hard-coded + +DataShapes ={ + "nanoxnano": {'N':32, 'Anz':64, 'Bnz':56, 'Cnz': 256}, + "tinyxtiny": {'N':128, 'Anz':1256, 'Bnz':1028, 'Cnz': 1640}, + "smallxsmall": {'N':1024, 'Anz': 65_536, 'Bnz':65_536, 'Cnz': 10000}, + "ti_denxti_den": {'N':32, 'Anz':1024, 
'Bnz':1024, 'Cnz': 1024}, + "ti_spaxti_den": {'N':32, 'Anz':256, 'Bnz':1024, 'Cnz': 1024}, + "medxmed": {'N':4096, 'Anz': 2**20, 'Bnz':2**20}, + "largexlarge": {'N':2**16, 'Anz': 64*2**20, 'Bnz':64*2**20} +} + +FORMATS = { "sparse": ["phase1", "phase2", "mxm_sparse"], + "dense": ["dense_phase1", "mxm_dense"], + "sparse_dense": ["dense_phase1", "mxm_sparse_dense"], + "reduce": ["reduce"]} + +FORMAT_INPUTS = { + "sparse": [("GxB_SPARSE", "GxB_SPARSE")], + "dense": [("GxB_FULL", "GxB_FULL"), ("GxB_BITMAP", "GxB_BITMAP")], + "sparse_dense": [("GxB_SPARSE", "GxB_FULL")], + "reduce": [("GxB_SPARSE", "GxB_SPARSE")] +} + +FORMAT_DATASETS = { + "sparse": ["nanoxnano", "tinyxtiny", "smallxsmall"], + "dense": ["ti_denxti_den"], + "sparse_dense": ["ti_spaxti_den"], + "reduce": ["nanoxnano", "smallxsmall", "ti_denxti_den", "ti_spaxti_den"] +} def std_type_to_gb_type(t): return SUPPORTED_TYPES[t] @@ -23,87 +57,101 @@ def build_gb_binop(t, b): gb_type = std_type_to_gb_type(t) return f"{GB_TYPE_PREFIX}_{b}_{gb_type}" -def buildTest(ts="TestsuiteName",kernels=DOT3_BUCKETS, ds= "tiny-tiny", SUM="PLUS", PRODUCT="TIMES",phase=3, - typeC="int32_t",typeM="int32_t",typeA="int32_t",typeB="int32_t",type_x="int32_t",type_y="int32_t",type_z="int32_t"): + + + +def buildTest(ts="TestsuiteName", ds="tiny-tiny", df=("GxB_SPARSE", "GxB_SPARSE"), + SUM="PLUS", PRODUCT="TIMES", + typeC="int32_t",typeM="int32_t", + typeA="int32_t",typeB="int32_t", + type_x="int32_t", type_y="int32_t",type_z="int32_t"): # build string interpolation from pieces - Test_name = f"{ds}{SUM}_{PRODUCT}_C{typeC}M{typeM}A{typeA}B{typeB}X{type_x}Y{type_y}Z{type_z}" + format_A, format_B = df + + Test_name = f"{ds}{SUM}_{PRODUCT}__{format_A}_{format_B}__C{typeC}M{typeM}A{typeA}B{typeB}X{type_x}Y{type_y}Z{type_z}" + Test_suite = f"{ts}" - Test_suite = f"{ts}_{phase}" - #print(Test_suite) - TEST_HEAD = f"""TEST( {Test_suite}, {Test_name})""" - #print(TEST_HEAD) N = DataShapes[ds]['N'] Anz = DataShapes[ds]['Anz'] Bnz = DataShapes[ds]['Bnz'] + Cnz = DataShapes[ds]['Cnz'] gb_monoid = build_gb_monioid(typeC, SUM) gb_binop = build_gb_binop(typeC, PRODUCT) - phase1_body= f""" test_AxB_phase1_factory< {typeC}, {typeM}, {typeA}, {typeB}>( 5, {N}, {Anz}, {Bnz}, monoid, binop);""" - phase2_body= f""" test_AxB_phase2_factory< {typeC} >( 5, {N}, {Anz},{Bnz});""" - phase3_body = ''.join([f""" test_AxB_dot3_full_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > ({kern}, {N}, {Anz}, {Bnz}, monoid, binop);\n""" for kern in kernels]) - reduce_body = f""" test_reduce_factory<{typeC}>({N}, monoid);""" - phasedict = { 1: phase1_body, 2: phase2_body, 3: phase3_body, 4: reduce_body } - TEST_BODY= phasedict[phase] + TEST_HEAD = f""" + TEST( {Test_suite}, {Test_name}) {{ - return TEST_HEAD,TEST_BODY, gb_monoid, gb_binop + /************************** + * Create reference and input data + */ + GrB_Monoid monoid = {gb_monoid}; + GrB_BinaryOp binop = {gb_binop}; + + mxm_problem_spec<{typeC}, {typeM}, {typeA}, {typeB}> problem_spec(monoid, binop, {N}, {Anz}, {Bnz}, {Cnz}, + {format_A}, {format_B}); + """ + phase1_body= f""" test_AxB_phase1_factory< {typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" + phase2_body= f""" test_AxB_phase2_factory< {typeC}, {typeM}, {typeA}, {typeB} >(problem_spec);""" + dense_phase1_body = f""" test_AxB_dense_phase1_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" + mxm_sparse_body = f""" test_AxB_dot3_sparse_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" + mxm_dense_body = 
f""" test_AxB_dot3_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" + mxm_sparse_dense_body = f""" test_AxB_dot3_sparse_dense_factory< {typeC},{typeM},{typeA},{typeB},{type_x},{type_y},{type_z} > (problem_spec);\n""" + reduce_body = f""" test_reduce_factory<{typeC}, {typeM}, {typeA}, {typeB}>(problem_spec);""" + phasedict = { "phase1": phase1_body, + "phase2": phase2_body, + "mxm_sparse": mxm_sparse_body, + "mxm_dense": mxm_dense_body, + "mxm_sparse_dense": mxm_sparse_dense_body, + "reduce": reduce_body, + "dense_phase1": dense_phase1_body } + + return TEST_HEAD, phasedict def load_types(argv): test_suite_name = argv[2] - - Monoids = argv[3].split(";") Binops = argv[4].split(";") Semirings = argv[5] DataTypes = argv[6].split(";") # Hard-coding data shapes for now - - DataShapes ={ - "tinyxtiny": {'N':32, 'Anz':256, 'Bnz':128}, - "smallxsmall": {'N':1024, 'Anz': 65_536, 'Bnz':65_536} - # "medxmed": {'N':4096, 'Anz': 2**20, 'Bnz':2**20} - # "largexlarge": {'N':2**16, 'Anz': 64*2**20, 'Bnz':64*2**20} - } - Kernels= argv[7] return argv[1], test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels -def write_test_instances_header(test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels): - outfile = f'{test_suite_name}_{Semirings}_test_instances.hpp' +def write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels): + outfile = f'{test_suite_name}_{Semirings}_{mat_format}_test_instances.hpp' with open(outfile, 'w') as fp: - fp.write("#pragma once\n"); - for m in Monoids: - for b in Binops: - Test_suite = f'{test_suite_name}_tests_{m}_{b}' - for dtC in DataTypes: - dtX = dtC - dtY = dtC - dtZ = dtC - for dtM in ["bool", "int32_t"]: - for dtA in DataTypes: - for dtB in DataTypes: - for ds in DataShapes: - for phase in [1, 2, 3, 4]: - - TEST_HEAD, TEST_BODY, gb_monoid, gb_binop = buildTest( Test_suite, Kernels, ds, m, b, phase, - dtC, dtM, dtA, dtB, dtX, dtY, dtZ) - fp.write( TEST_HEAD) - fp.write( """{ GrB_Monoid monoid = %s; GrB_BinaryOp binop = %s; """%(gb_monoid, gb_binop)) - fp.write( TEST_BODY) - fp.write( "}\n") - -def write_cuda_test(source_dir, test_suite_name, semiring, kernel): + fp.write("#pragma once\n#include \"problem_spec.hpp\"\n"); + m, b = Semirings.split("_") + Test_suite = f'{test_suite_name}_tests_{mat_format}_{m}_{b}' + for dtC in DataTypes: + dtX = dtC + dtY = dtC + dtZ = dtC + for dtM in ["bool", "int32_t", "int64_t", "float", "double"]: + for dtA in DataTypes: + for dtB in DataTypes: + for ds in FORMAT_DATASETS[mat_format]: + for df in FORMAT_INPUTS[mat_format]: + TEST_HEAD, TEST_BODY = buildTest( Test_suite, ds, df, m, b, + dtC, dtM, dtA, dtB, dtX, dtY, dtZ) + fp.write( TEST_HEAD) + for test in tests: + fp.write( TEST_BODY[test] ) + fp.write( "}\n") + +def write_cuda_test(source_dir, test_suite_name, mat_format, semiring, kernel): import shutil - shutil.copy(f"{source_dir}/test/cuda_tests_template.cpp", f"{test_suite_name}_{semiring}_cuda_tests.cpp") + shutil.copy(f"{source_dir}/test/cuda_tests_template.cpp", f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp") - with open(f"{test_suite_name}_{semiring}_cuda_tests.cpp", "a") as file_object: + with open(f"{test_suite_name}_{semiring}_{mat_format}_cuda_tests.cpp", "a") as file_object: # Keeping this as a separate file for now to allow for further nesting # of test instances for each test_suite_name - file_object.write(f"\n#include 
\"{test_suite_name}_{semiring}_test_instances.hpp\"") + file_object.write(f"\n#include \"{test_suite_name}_{semiring}_{mat_format}_test_instances.hpp\"") if __name__ == "__main__": import sys @@ -116,6 +164,6 @@ def write_cuda_test(source_dir, test_suite_name, semiring, kernel): """ source_dir, test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, Kernels = load_types(sys.argv) - write_test_instances_header(test_suite_name, Monoids, Binops, Semirings, DataTypes, DataShapes, DOT3_BUCKETS) - - write_cuda_test(source_dir, test_suite_name, Semirings, Kernels) + for mat_format, tests in FORMATS.items(): + write_test_instances_header(test_suite_name, mat_format, tests, Monoids, Binops, Semirings, DataTypes, DataShapes, DOT3_BUCKETS) + write_cuda_test(source_dir, test_suite_name, mat_format, Semirings, Kernels) diff --git a/GraphBLAS/CUDA/test/test_data.hpp b/GraphBLAS/CUDA/test/test_data.hpp index a3fdaee6f..1e418260b 100644 --- a/GraphBLAS/CUDA/test/test_data.hpp +++ b/GraphBLAS/CUDA/test/test_data.hpp @@ -2,6 +2,8 @@ #include #include +#pragma once + template class TestData { diff --git a/GraphBLAS/CUDA/tofix/testJit.cpp b/GraphBLAS/CUDA/tofix/testJit.cpp deleted file mode 100644 index b6a13407f..000000000 --- a/GraphBLAS/CUDA/tofix/testJit.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Extended example for building on-the-fly kernels with C interface. - Simple examples demonstrating different ways to load source code - and call kernels. - */ - - -#include "../test/jitTestFactory.hpp" - - -int main(int argc, char* argv[]) { -#if __cplusplus >= 201103L - -#define TEST_RESULT(result) (result ? 
"PASSED" : "FAILED") -std::cout << "Running tests..."<(256, 32,120,"PLUS_TIMES"); - std::cout << "test_spdotfactoryUM non-uniform uncached: " - << TEST_RESULT(test_spdot_plus_times_ffd_nu) - << std::endl; - - std::cout<<" spdot_plus_times_ffd_large_non_uniform"<(4096, 256,256,"PLUS_TIMES"); - std::cout << "test_spdotfactoryUM non-uniform uncached: " - << TEST_RESULT(test_spdot_plus_times_ffd_lrg_nu) - << std::endl; - - std::cout<<" spdot_plus_times_fff"<(256, 32,32,"PLUS_TIMES"); - std::cout << "test_spdotfactoryUM uncached: " - << TEST_RESULT(test_spdot_plus_times_fff) - << std::endl; - - std::cout<<" spdot_plus_times_ffd"<(256, 32,32,"PLUS_TIMES"); - std::cout << "test_spdotfactoryUM uncached: " - << TEST_RESULT(test_spdot_plus_times_ffd) - << std::endl; - - - cudaSetDevice(0); - bool test_spdot_batch_ffff = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); - std::cout << "test_spdot_batchUM uncached: " - << TEST_RESULT(test_spdot_batch_ffff) - << std::endl; - - bool test_spdot_batch_iiii = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); - std::cout << "test_spdot_batchUM uncached: " - << TEST_RESULT(test_spdot_batch_iiii) - << std::endl; - - cudaSetDevice(1); - - bool test_spdot_batch_liii= test_spdot_batch_factoryUM(5, 32, 256, 128, "PLUS_TIMES"); - std::cout << "test_spdot_batchUM uncached: " - << TEST_RESULT(test_spdot_batch_liii) - << std::endl; -/* - - - - bool test_dot_min_plus_iil = test_dotfactoryUM(4096,"MIN_PLUS"); - std::cout << "test_dotfactoryUM uncached: " - << TEST_RESULT(test_dot_min_plus_iil) - << std::endl; - - bool test_dot_min_plus_ffd = test_dotfactoryUM(4096,"MIN_PLUS"); - std::cout << "test_dotfactoryUM uncached: " - << TEST_RESULT(test_dot_min_plus_ffd) - << std::endl; - - bool test_dot_plus_times_ffd = test_dotfactoryUM(4096,"PLUS_TIMES"); - std::cout << "test_dotfactoryUM uncached: " - << TEST_RESULT(test_dot_plus_times_ffd) - << std::endl; - - bool test_dot_plus_times_fii = test_dotfactoryUM(4096,"PLUS_TIMES"); - std::cout << "test_dotfactoryUM uncached: " - << TEST_RESULT(test_dot_plus_times_fii) - << std::endl; - - bool test_dot_plus_times_iil = test_dotfactoryUM(4096,"PLUS_TIMES"); - std::cout << "test_dotfactoryUM uncached: " - << TEST_RESULT(test_dot_plus_times_iil) - << std::endl; - - bool test_reducefactory_float_result = test_reducefactoryUM(4096, "PLUS"); - std::cout << "test_reducefactoryUM uncached: " - << TEST_RESULT(test_reducefactory_float_result) - << std::endl; - - bool test_reducefactory_double_plus_result = test_reducefactoryUM(4096, "PLUS"); - std::cout << "test_reducefactoryUM uncached: " - << TEST_RESULT(test_reducefactory_double_plus_result) - << std::endl; - - std::cout << "testing cached kernel" <(4096, "PLUS"); - std::cout << "test_reducefactoryUM cached: " - << TEST_RESULT(test2_reducefactory_double_plus_result) - << std::endl; - - bool test_reducefactory_float_min_result = test_reducefactoryUM(32,"MIN"); - std::cout << "test_reducefactoryUM MIN uncached: " - << TEST_RESULT(test_reducefactory_float_min_result) - << std::endl; - - bool test_reducefactory_int_min_result = test_reducefactoryUM(32,"MIN"); - std::cout << "test_reducefactoryUM MIN uncached: " - << TEST_RESULT(test_reducefactory_int_min_result) - << std::endl; - - bool test_reducefactory_int_max_result = test_reducefactoryUM(32,"MAX"); - std::cout << "test_reducefactoryUM MAX uncached: " - << TEST_RESULT(test_reducefactory_int_max_result) - << std::endl; - - bool test_reducefactory_int_result = test_reducefactoryUM(4096,"PLUS"); - std::cout << 
"test_reducefactoryUM PLUS uncached: " - << TEST_RESULT(test_reducefactory_int_result) - << std::endl; - - bool test_reducefactory_int_cache_result = - test_reducefactoryUM(4096,"PLUS"); - std::cout << "test_reducefactoryUM PLUS cached: " - << TEST_RESULT(test_reducefactory_int_cache_result) - << std::endl; -*/ - return 0; - -#else - std::cout << "Tests require building with C++14 support (make CXX14=1)" - << std::endl; - return 0; -#endif -} diff --git a/GraphBLAS/Config/GraphBLAS.h.in b/GraphBLAS/Config/GraphBLAS.h.in index 25ca36c7a..1f6b33477 100644 --- a/GraphBLAS/Config/GraphBLAS.h.in +++ b/GraphBLAS/Config/GraphBLAS.h.in @@ -352,21 +352,25 @@ GrB_Info ; typedef enum { - GrB_NONBLOCKING = 0, // methods may return with pending computations - GrB_BLOCKING = 1 // no computations are ever left pending + GrB_NONBLOCKING = 0, // methods may return with pending computations + GrB_BLOCKING = 1, // no computations are ever left pending +// DRAFT: in progress, do not use: + GxB_NONBLOCKING_GPU = 2, // non-blocking mode, allow use of GPU(s) + GxB_BLOCKING_GPU = 3, // blocking mode, allow use of GPU(s) } GrB_Mode ; GB_PUBLIC GrB_Info GrB_init // start up GraphBLAS ( - GrB_Mode mode // blocking or non-blocking mode + GrB_Mode mode // blocking or non-blocking mode, no GPU ) ; GB_PUBLIC GrB_Info GxB_init // start up GraphBLAS and also define malloc, etc ( - GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode, + // with or without GPU // pointers to memory management functions void * (* user_malloc_function ) (size_t), void * (* user_calloc_function ) (size_t, size_t), @@ -467,7 +471,7 @@ GrB_Info GrB_getVersion // runtime access to C API version number // done, and this setting has no effect. // // GxB_COMPRESSION: compression method for GxB_Matrix_serialize and -// GxB_Vector_serialize. The default is LZ4. +// GxB_Vector_serialize. The default is ZSTD (level 1). // // GxB_IMPORT: GxB_FAST_IMPORT (faster, for trusted input data) or // GxB_SECURE_IMPORT (slower, for untrusted input data), for the @@ -945,6 +949,10 @@ GB_PUBLIC GrB_UnaryOp GxB_LGAMMA_FP32, GxB_TGAMMA_FP32, GxB_ERF_FP32, GxB_ERFC_FP32, GxB_LGAMMA_FP64, GxB_TGAMMA_FP64, GxB_ERF_FP64, GxB_ERFC_FP64, + // z = cbrt (x) + GxB_CBRT_FP32, + GxB_CBRT_FP64, + // frexpx and frexpe return the mantissa and exponent, respectively, // from the ANSI C11 frexp function. The exponent is returned as a // floating-point value, not an integer. @@ -3196,6 +3204,17 @@ GrB_Info GrB_Vector_extractElement // x = v(i) (x, v, i) #endif +// GxB_Vector_isStoredElement determines if v(i) is present in the structure +// of the vector v, as a stored element. It does not return the value. It +// returns GrB_SUCCESS if the element is present, or GrB_NO_VALUE otherwise. + +GB_PUBLIC +GrB_Info GxB_Vector_isStoredElement // determine if v(i) is a stored element +( + const GrB_Vector v, // vector to check + GrB_Index i // row index +) ; + //------------------------------------------------------------------------------ // GrB_Vector_removeElement //------------------------------------------------------------------------------ @@ -3994,6 +4013,18 @@ GrB_Info GrB_Matrix_extractElement // x = A(i,j) (x, A, i, j) #endif +// GxB_Matrix_isStoredElement determines if A(i,j) is present in the structure +// of the matrix A, as a stored element. It does not return the value. It +// returns GrB_SUCCESS if the element is present, or GrB_NO_VALUE otherwise. 
+ +GB_PUBLIC +GrB_Info GxB_Matrix_isStoredElement // determine if A(i,j) is a stored element +( + const GrB_Matrix A, // matrix to check + GrB_Index i, // row index + GrB_Index j // column index +) ; + //------------------------------------------------------------------------------ // GrB_Matrix_removeElement //------------------------------------------------------------------------------ @@ -11247,10 +11278,10 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format // GrB_Matrix_serialize/deserialize are slightly different from their GxB* // counterparts. The blob is allocated by GxB_Matrix_serialize, and must be -// freed by GxB_serialize_free (which calls the ANSI C11 free if GrB_init was -// used). By contrast, the GrB* methods require the user application to pass -// in a preallocated blob to GrB_Matrix_serialize, whose size can be given by -// GrB_Matrix_serializeSize (as a loose upper bound). +// freed by the same free() method passed to GxB_init (or the ANSI C11 free() +// if GrB_init was used). By contrast, the GrB* methods require the user +// application to pass in a preallocated blob to GrB_Matrix_serialize, whose +// size can be given by GrB_Matrix_serializeSize (as a loose upper bound). // The GrB* and GxB* methods can be mixed. GrB_Matrix_serialize and // GxB_Matrix_serialize construct the same blob (assuming they are given the @@ -11339,20 +11370,14 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format free (blob) ; */ -// Three methods are currently implemented: no compression, LZ4, and LZ4HC +// Currently implemented: no compression, LZ4, LZ4HC, and ZSTD #define GxB_COMPRESSION_NONE -1 // no compression -#define GxB_COMPRESSION_DEFAULT 0 // LZ4 +#define GxB_COMPRESSION_DEFAULT 0 // ZSTD (level 1) #define GxB_COMPRESSION_LZ4 1000 // LZ4 #define GxB_COMPRESSION_LZ4HC 2000 // LZ4HC, with default level 9 +#define GxB_COMPRESSION_ZSTD 3000 // ZSTD, with default level 1 -// possible future methods that could be added: -// #define GxB_COMPRESSION_ZLIB 3000 // ZLIB, with default level 6 -// #define GxB_COMPRESSION_LZO 4000 // LZO, with default level 2 -// #define GxB_COMPRESSION_BZIP2 5000 // BZIP2, with default level 9 -// #define GxB_COMPRESSION_LZSS 6000 // LZSS - -// using the Intel IPP versions, if available (not yet supported); -#define GxB_COMPRESSION_INTEL 1000000 +#define GxB_COMPRESSION_INTEL 1000000 // not yet supported // Most of the above methods have a level parameter that controls the tradeoff // between run time and the amount of compression obtained. Higher levels @@ -11360,31 +11385,16 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format // LZ4 no level setting // LZ4HC 1: fast, 9: default, 9: max - -// these methos are not yet supported but may be added in the future: -// ZLIB 1: fast, 6: default, 9: max -// LZO 1: fast (X1ST), 2: default (XST) -// BZIP2 1: fast, 9: default, 9: max -// LZSS no level setting +// ZSTD: 1: fast, 1: default, 19: max // For all methods, a level of zero results in the default level setting. // These settings can be added, so to use LZ4HC at level 5, use method = // GxB_COMPRESSION_LZ4HC + 5. -// If the Intel IPPS compression methods are available, they can be selected -// by adding GxB_COMPRESSION_INTEL. For example, to use the Intel IPPS -// implementation of LZ4HC at level 9, use method = GxB_COMPRESSION_INTEL + -// GxB_COMPRESSION_LZ4HC + 9 = 1,002,009. If the Intel methods are requested -// but not available, this setting is ignored and the non-Intel methods are -// used instead. 
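For reviewers, a minimal usage sketch of the two new GxB_*_isStoredElement methods declared above (an editorial example, not part of the patch; the function name check_entry and the indices are illustrative). The result is conveyed entirely by the return code, since the methods test the structure only and never read the value:

    #include "GraphBLAS.h"
    #include <stdbool.h>

    // Sketch: does A(2,3) exist as an explicit entry in the pattern of A?
    // GrB_SUCCESS means the entry is stored; GrB_NO_VALUE means it is not.
    bool check_entry (GrB_Matrix A)
    {
        GrB_Info info = GxB_Matrix_isStoredElement (A, 2, 3) ;
        // note: the entry may be stored with any value, including the
        // implicit "zero" of whatever semiring is in use
        return (info == GrB_SUCCESS) ;
    }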
- // If the level setting is out of range, the default is used for that method. // If the method is negative, no compression is performed. If the method is -// positive but unrecognized, the default is used (GxB_COMPRESSION_LZ4, with no -// level setting, and the non-Intel version). - -// If a method is not implemented, LZ4 is used instead, and the level setting -// is ignored. +// positive but unrecognized, the default is used (GxB_COMPRESSION_ZSTD, +// level 1). GB_PUBLIC GrB_Info GxB_Matrix_serialize // serialize a GrB_Matrix to a blob @@ -11536,6 +11546,65 @@ GrB_Info GxB_Matrix_sort ) \ (arg1, __VA_ARGS__) +//============================================================================== +// GxB_Matrix_reshape and GxB_Matrix_reshapeDup: reshape a matrix +//============================================================================== + +// GxB_Matrix_reshape changes the dimensions of a matrix, reshaping the entries +// by row or by column. + +// For example, if C is 3-by-4 on input, and is reshaped by column to have +// dimensions 2-by-6: + +// C on input C on output (by_col true) +// 00 01 02 03 00 20 11 02 22 13 +// 10 11 12 13 10 01 21 12 03 23 +// 20 21 22 23 + +// If the same C on input is reshaped by row to dimensions 2-by-6: + +// C on input C on output (by_col false) +// 00 01 02 03 00 01 02 03 10 11 +// 10 11 12 13 12 13 20 21 22 23 +// 20 21 22 23 + +// If the input matrix is nrows-by-ncols, and the size of the reshaped matrix +// is nrows_new-by-ncols_new, then nrows*ncols must equal nrows_new*ncols_new. +// The format of the input matrix (by row or by column) is unchanged; this +// format need not match the by_col input parameter. + +GB_PUBLIC +GrB_Info GxB_Matrix_reshape // reshape a GrB_Matrix in place +( + // input/output: + GrB_Matrix C, // input/output matrix, reshaped in place + // input: + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // new number of rows of C + GrB_Index ncols_new, // new number of columns of C + const GrB_Descriptor desc // to control # of threads used +) ; + +// GxB_Matrix_reshapeDup reshapes a matrix into another matrix. + +// If the input matrix A is nrows-by-ncols, and the size of the newly-created +// matrix C is nrows_new-by-ncols_new, then nrows*ncols must equal +// nrows_new*ncols_new. The format of the input matrix A (by row or by column) +// determines the format of the output matrix C, which need not match the +// by_col input parameter. 
+ +GB_PUBLIC +GrB_Info GxB_Matrix_reshapeDup // reshape a GrB_Matrix into another GrB_Matrix +( + // output: + GrB_Matrix *C, // newly created output matrix, not in place + // input: + GrB_Matrix A, // input matrix, not modified + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // number of rows of C + GrB_Index ncols_new, // number of columns of C + const GrB_Descriptor desc // to control # of threads used +) ; //============================================================================== // GxB_Iterator: an object that iterates over the entries of a matrix or vector @@ -12542,15 +12611,28 @@ extern "C" { #endif // TODO describe the modes -typedef enum { rmm_wrap_host=0, rmm_wrap_host_pinned=1, rmm_wrap_device=2, rmm_wrap_managed=3 } RMM_MODE ; +typedef enum +{ + rmm_wrap_host = 0, + rmm_wrap_host_pinned = 1, + rmm_wrap_device = 2, + rmm_wrap_managed = 3 +} RMM_MODE ; void rmm_wrap_finalize (void) ; -int rmm_wrap_initialize (RMM_MODE mode, size_t init_pool_size, size_t max_pool_size) ; + +int rmm_wrap_initialize +( + RMM_MODE mode, + size_t init_pool_size, + size_t max_pool_size +) ; // example usage: // rmm_wrap_initialize (rmm_wrap_managed, INT32_MAX, INT64_MAX) ; - // GxB_init (GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free) ; - // use GraphBLAS ... + // GxB_init (GxB_NONBLOCKING_GPU, rmm_wrap_malloc, rmm_wrap_calloc, + // rmm_wrap_realloc, rmm_wrap_free) ; + // use GraphBLAS ... with the GPU // GrB_finalize ( ) ; // rmm_wrap_finalize ( ) ; diff --git a/GraphBLAS/Config/README.md.in b/GraphBLAS/Config/README.md.in index 488566631..41ab98be0 100644 --- a/GraphBLAS/Config/README.md.in +++ b/GraphBLAS/Config/README.md.in @@ -24,8 +24,8 @@ built-in sparse matrix multiply in MATLAB R2021a, where `C=A*B` is now up to 30x faster than in prior versions of MATLAB (on my 20-core NVIDIA DGX Station). The development of this package is supported by Intel, NVIDIA (including the -donation of the 20-core DGX Station), Redis, MIT Lincoln Lab, IBM, and Julia -Computing. +donation of the 20-core DGX Station), Redis, MIT Lincoln Lab, MathWorks, +IBM, and Julia Computing. See the user guide in `Doc/GraphBLAS_UserGuide.pdf` for documentation on the SuiteSparse implementation of GraphBLAS, and how to use it in your diff --git a/GraphBLAS/Demo/Program/wathen_demo.c b/GraphBLAS/Demo/Program/wathen_demo.c index a40dd7f54..0382c999f 100644 --- a/GraphBLAS/Demo/Program/wathen_demo.c +++ b/GraphBLAS/Demo/Program/wathen_demo.c @@ -43,8 +43,8 @@ int main (int argc, char **argv) } OK (GxB_Global_Option_get (GxB_GLOBAL_NTHREADS, &nthreads)) ; - fprintf (stderr, "Wathen: nx %ld ny %ld method: %d nthreads: %d ", - nx, ny, method, nthreads) ; + fprintf (stderr, "Wathen: nx %d ny %d method: %d nthreads: %d ", + (int) nx, (int) ny, method, nthreads) ; //-------------------------------------------------------------------------- // create a Wathen matrix diff --git a/GraphBLAS/Doc/ChangeLog b/GraphBLAS/Doc/ChangeLog index 878af3a11..2f2f0173b 100644 --- a/GraphBLAS/Doc/ChangeLog +++ b/GraphBLAS/Doc/ChangeLog @@ -1,3 +1,35 @@ +Version 7.2.0, Aug 8, 2022 + + * added ZSTD as a compression option for serialize/deserialize: + Version 1.5.3 by Yann Collet, https://github.com/facebook/zstd.git + Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + Included in SuiteSparse:GraphBLAS via its BSD-3-clause license. + The default method is now ZSTD, level 1. 
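To make the new serialization and reshape features concrete, two hedged C sketches follow (editorial examples, not part of the patch; the wrapper names, the GrB_FP64 type, and the use of GxB_Desc_set to select GxB_COMPRESSION are assumptions based on the header comments above).

First, a serialize/deserialize round trip with ZSTD at level 5, following the "method + level" convention documented in the header; a NULL descriptor would keep the new default, ZSTD at level 1:

    #include "GraphBLAS.h"
    #include <stdlib.h>

    // Sketch: serialize A with ZSTD level 5, then reconstruct it as C.
    // Assumes A has type GrB_FP64 and that GrB_init was used, so the blob
    // is freed with the ANSI C free(), as documented above.
    GrB_Info roundtrip (GrB_Matrix A, GrB_Matrix *C)
    {
        GrB_Descriptor desc = NULL ;
        GrB_Descriptor_new (&desc) ;
        GxB_Desc_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_ZSTD + 5) ;

        void *blob = NULL ;
        GrB_Index blob_size = 0 ;
        GrB_Info info = GxB_Matrix_serialize (&blob, &blob_size, A, desc) ;
        if (info == GrB_SUCCESS)
        {
            info = GxB_Matrix_deserialize (C, GrB_FP64, blob, blob_size, NULL) ;
        }
        free (blob) ;
        GrB_Descriptor_free (&desc) ;
        return (info) ;
    }

Second, the two reshape methods whose prototypes appear above, mirroring the 3-by-4 to 2-by-6 by-column example from the header comments (which requires nrows*ncols == nrows_new*ncols_new):

    #include "GraphBLAS.h"
    #include <stdbool.h>

    // Sketch: reshape a 3-by-4 matrix C into 2-by-6 by column, in place.
    // A NULL descriptor uses the default number of threads.
    GrB_Info reshape_in_place (GrB_Matrix C)
    {
        return (GxB_Matrix_reshape (C, true, 2, 6, NULL)) ;
    }

    // Sketch: the same reshape, but into a new matrix C, leaving A unchanged.
    GrB_Info reshape_into_copy (GrB_Matrix *C, GrB_Matrix A)
    {
        return (GxB_Matrix_reshapeDup (C, A, true, 2, 6, NULL)) ;
    }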
+   * added GxB_Matrix_reshape and GxB_Matrix_reshapeDup
+   * MATLAB interface: faster C(:)=A, C=A(:), and reshape.
+     Better error messages.
+
+Version 7.1.2, July 8, 2022
+
+   * MATLAB interface: linear indexing added for C(:)=A, C=A(:), and
+     single-output I=find(C).  Faster bandwidth, istriu, istril,
+     isbanded, isdiag.  C(I,J)=A can now grow the size of C.
+
+Version 7.1.1, June 17, 2022
+
+   * minor updates to documentation and error messages
+   * MATLAB interface: minor revision of GrB.deserialize
+
+Version 7.1.0, May 20, 2022
+
+   * added cube root: GxB_CBRT_FP* unary operators
+   * added GxB_Matrix_isStoredElement and GxB_Vector_isStoredElement
+
+Version 7.0.4, Apr 25, 2022
+
+   * (46) bug fix: user-defined type size was incorrectly limited
+     to 128 bytes.  Caught by Erik Welch.
+
 Version 7.0.3, Apr 8, 2022
 
    * faster transpose when using 2 threads
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf
index 3574e76c3..66b5da52d 100644
Binary files a/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf and b/GraphBLAS/Doc/GraphBLAS_UserGuide.pdf differ
diff --git a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
index 01e4b266d..d0dd64200 100644
--- a/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
+++ b/GraphBLAS/Doc/GraphBLAS_UserGuide.tex
@@ -141,810 +141,220 @@ \section{Introduction} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 are extensions to the GraphBLAS C API.
 \end{alert}
+
 \newpage

-%-------------------------------------------------------------------------------
-\subsection{Release Notes}
-%-------------------------------------------------------------------------------
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Basic Concepts} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{basic}

-\begin{itemize}
+Since the {\em GraphBLAS C API Specification} provides a precise definition of
+GraphBLAS, not every detail of every function is provided here.  For example,
+some error codes returned by GraphBLAS are self-explanatory, but since a
+specification must precisely define all possible error codes a function can
+return, these are listed in detail in the {\em GraphBLAS C API Specification}.
+Including them all here is not essential, however, and the additional detail
+might detract from a clearer view of the essential features of the GraphBLAS
+functions.

-\item Version 7.0.3 (Apr 8, 2022)
+This User Guide also assumes the reader is familiar with Octave/MATLAB.
+MATLAB supports only the conventional plus-times semiring on sparse
+double and complex matrices, but a MATLAB-like notation easily extends to the
+arbitrary semirings used in GraphBLAS.  The matrix multiplication in the
+example in the Introduction can be written in MATLAB notation as
+\verb'C=A*B', if the Boolean \verb'OR-AND' semiring is understood.  Relying on
+a MATLAB-like notation allows the description in this User Guide to be
+expressive, easy to understand, and terse at the same time.  {\em The GraphBLAS
+C API Specification} also makes use of some MATLAB-like language, such
+as the colon notation.

-  \begin{packed_itemize}
-  \item faster transpose when using 2 threads
-  \end{packed_itemize}
+MATLAB notation will always appear here in fixed-width font, such as
+\verb'C=A*B(:,j)'.
In standard mathematical notation it would be written as
+the matrix-vector multiplication ${\bf C = A b}_j$ where ${\bf b}_j$ is the
+$j$th column of the matrix ${\bf B}$.  The GraphBLAS standard is a C API and
+SuiteSparse:GraphBLAS is written in C, and so a great deal of C syntax appears
+here as well, also in fixed-width font.  This User Guide alternates between all
+three styles as needed.

-\item Version 7.0.2 (Apr 5, 2022)
+%===============================================================================
+\subsection{Graphs and sparse matrices} %=======================================
+%===============================================================================
+\label{sparse}

-  \begin{packed_itemize}
-  \item (45) bug fix: vector iterator was broken for iterating across a
-  vector in bitmap format.  Caught by Erik Welch.
-  \end{packed_itemize}
+Graphs can be huge, with many nodes and edges.  A dense adjacency matrix ${\bf
+A}$ for a graph of $n$ nodes takes $O(n^2)$ memory, which is impossible if $n$
+is, say, a million.  Let $|{\bf A}|$ denote the number of entries in a matrix.
+Most graphs arising in practice are sparse, however, with only $|{\bf A}|=O(n)$
+edges, where $|{\bf A}|$ denotes the number of edges in the graph, or the
+number of explicit entries present in the data structure for the matrix ${\bf
+A}$.  Sparse graphs with millions of nodes and edges can easily be created by
+representing them as sparse matrices, where only explicit values need to be
+stored.  Some graphs are {\em hypersparse}, with $|{\bf A}| \ll n$.
+SuiteSparse:GraphBLAS supports three kinds of sparse matrix formats: a regular
+sparse format, taking $O(n+|{\bf A}|)$ space, a hypersparse format taking only
+$O(|{\bf A}|)$ space, and a bitmap form, taking $O(n^2)$ space.  Full matrices
+are also represented in $O(n^2)$ space.  Using its hypersparse format, creating
+a sparse matrix of size $n$-by-$n$ where $n=2^{60}$ (about $10^{18}$) can be
+done quite easily on a commodity laptop, limited only by $|{\bf A}|$.
+To the GraphBLAS user application, all matrices look alike, since these formats
+are opaque, and SuiteSparse:GraphBLAS switches between them at will.

-\item Version 7.0.1 (Apr 3, 2022)
+A sparse matrix data structure only stores a subset of the possible $n^2$
+entries, and it assumes the values of entries not stored have some implicit
+value.  In conventional linear algebra, this implicit value is zero, but it
+differs with different semirings.  Explicit values are called {\em entries} and
+they appear in the data structure.  The {\em pattern} (also called the
+{\em structure}) of a matrix defines where its explicit entries appear.  It
+will be referenced in one of two equivalent ways.  It can be viewed as a set of
+indices $(i,j)$, where $(i,j)$ is in the pattern of a matrix ${\bf A}$ if ${\bf
+A}(i,j)$ is an explicit value.  It can also be viewed as a Boolean matrix ${\bf
+S}$ where ${\bf S}(i,j)$ is true if $(i,j)$ is an explicit entry and false
+otherwise.  In MATLAB notation, \verb'S=spones(A)' or \verb'S=(A~=0)', if the
+implicit value is zero.  The \verb'(i,j)' pairs, and their values, can also be
+extracted from the matrix via the MATLAB expression \verb'[I,J,X]=find(A)',
+where the \verb'k'th tuple \verb'(I(k),J(k),X(k))' represents the explicit
+entry \verb'A(I(k),J(k))', with numerical value \verb'X(k)' equal to $a_{ij}$,
+with row index $i$=\verb'I(k)' and column index $j$=\verb'J(k)'.
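As an editorial aside, the discussion of \verb'[I,J,X]=find(A)' above has a direct C analog via GrB_Matrix_extractTuples; a minimal sketch, assuming a GrB_FP64 matrix (the function and array names are illustrative, and the arrays are named rows/cols/vals to avoid the complex.h macro I):

    #include "GraphBLAS.h"
    #include <stdlib.h>

    // Sketch: extract the (i,j,x) tuples of A, the C analog of the MATLAB
    // expression [I,J,X] = find(A).  On input, nvals is the capacity of the
    // three arrays; on output it is the number of tuples returned.
    GrB_Info find_tuples (GrB_Matrix A)
    {
        GrB_Index nvals ;
        GrB_Matrix_nvals (&nvals, A) ;
        GrB_Index *rows = malloc (nvals * sizeof (GrB_Index)) ;
        GrB_Index *cols = malloc (nvals * sizeof (GrB_Index)) ;
        double    *vals = malloc (nvals * sizeof (double)) ;
        GrB_Info info = GrB_Matrix_extractTuples_FP64 (rows, cols, vals,
            &nvals, A) ;
        // each (rows [k], cols [k], vals [k]) is an explicit entry of A
        free (rows) ; free (cols) ; free (vals) ;
        return (info) ;
    }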
-  \begin{packed_itemize}
-  \item added revised ACM TOMS submission to the Doc folder
-  \end{packed_itemize}
+The entries in the pattern of ${\bf A}$ can take on any value, including the
+implicit value, whatever it happens to be.  This differs slightly from MATLAB,
+which always drops all explicit zeros from its sparse matrices.  This is a
+minor difference but GraphBLAS cannot drop explicit zeros.  For example, in the
+max-plus tropical algebra, the implicit value is negative infinity, and zero
+has a different meaning.  Here, the MATLAB notation used will assume that no
+explicit entries are ever dropped because their explicit value happens to match
+the implicit value.

-\item Version 7.0.0 (Apr 2, 2022)
+{\em Graph Algorithms in the Language of Linear Algebra}, Kepner and Gilbert,
+eds., provides a framework for understanding how graph algorithms can be
+expressed as matrix computations \cite{KepnerGilbert2011}.  For additional
+background on sparse matrix algorithms, see also \cite{Davis06book} and
+\cite{DavisRajamanickamSidLakhdar16}.

-  \begin{packed_itemize}
-  \item (44) spec bug: \verb'GrB_Matrix_diag'
-  was implemented in v5.2.x and v6.x with the wrong signature.
-  This fix requires the major release to change, from v6.x to v7.x.
-  \item (43) performance bug fix for \verb'GrB_mxm':
-  auto selection for saxpy method (Hash vs Gustavson) revised.
-  \item \verb'GrB_assign': better performance for \verb'C(i,j)=scalar' and
-  \verb'C(i,j)+=scalar' when \verb'i' and \verb'j' have length 1 (scalar
-  assigment with no scalar expansion).
-  \end{packed_itemize}
+%===============================================================================
+\subsection{Overview of GraphBLAS methods and operations} %=====================
+%===============================================================================
+\label{overview}

-\item Version 6.2.5 (Mar 14, 2022)
+GraphBLAS provides a collection of {\em methods} to create, query, and free its
+objects: sparse matrices, sparse vectors, scalars, types, operators,
+monoids, semirings, and a descriptor object used for parameter settings.
+Details are given in Section~\ref{objects}.  Once these objects are created
+they can be used in mathematical {\em operations} (not to be confused with
+how the term {\em operator} is used in GraphBLAS).  A short summary of these
+operations and their nearest Octave/MATLAB analog is given in the table below.

-  \begin{packed_itemize}
-  \item For SuiteSparse v5.11.0.
-  \end{packed_itemize}

 % \vspace{0.1in}
 \begin{tabular}{ll}
 operation                       & approximate Octave/MATLAB analog \\
 \hline
 matrix multiplication           & \verb'C=A*B' \\
 element-wise operations         & \verb'C=A+B' and \verb'C=A.*B' \\
 reduction to a vector or scalar & \verb's=sum(A)' \\
 apply unary operator            & \verb'C=-A' \\
 transpose                       & \verb"C=A'" \\
 submatrix extraction            & \verb'C=A(I,J)' \\
 submatrix assignment            & \verb'C(I,J)=A' \\
 select                          & \verb'C=tril(A)' \\
 \hline
 \end{tabular}
 \vspace{0.1in}

-\item Version 6.2.4 (Mar 8, 2022)
+GraphBLAS can do far more than what Octave/MATLAB can do in these rough
+analogs, but the list provides a first step in describing what GraphBLAS can
+do.  Details of each GraphBLAS operation are given in Section~\ref{operations}.
+With this brief overview, the full scope of GraphBLAS extensions of these
+operations can now be described.

-  \begin{packed_itemize}
-  \item (42) bug fix: \verb'GrB_mxm' with 0-by-0 iso full matrices.
-  Caught by Henry Amuasi in the Python
-  grblas interface, then triaged and isolated by Erik Welch.
-  \end{packed_itemize}
+SuiteSparse:GraphBLAS has 13 built-in scalar types: Boolean, single and double
+precision floating-point (real and complex), and 8, 16, 32, and 64-bit signed
+and unsigned integers.  In addition, user-defined scalar types can be created
+from nearly any C \verb'typedef', as long as the entire type fits in a
+fixed-size contiguous block of memory (of arbitrary size).  All of these types
+can be used to create GraphBLAS sparse matrices, vectors, or scalars.

-\item Version 6.2.3 (Mar 5, 2022)
+The scalar addition of conventional matrix multiplication is replaced with a
+{\em monoid}.  A monoid is an associative and commutative binary operator
+\verb'z=f(x,y)' where all three domains are the same (the types of \verb'x',
+\verb'y', and \verb'z'), and where the operator has an identity value \verb'id'
+such that \verb'f(x,id)=f(id,x)=x'.  Performing matrix multiplication with a
+semiring uses a monoid in place of the ``add'' operator, scalar addition being
+just one of many possible monoids.  The identity value of addition is zero,
+since $x+0=0+x=x$.  GraphBLAS includes many built-in operators suitable for
+use as a monoid: min (with an identity value of positive infinity), max (whose
+identity is negative infinity), add (identity is zero), multiply (with an
+identity of one), four logical operators: AND, OR, exclusive-OR, and
+Boolean equality (XNOR), four bitwise operators (AND, OR, XOR, and XNOR),
+and the ANY operator.
+See Section~\ref{any_pair} for more details on the unusual ANY operator.
+User-created monoids can be defined with any associative and
+commutative operator that has an identity value.

-  \begin{packed_itemize}
-  \item minor update to documentation in \verb'GrB.build':
-  no change to any code
-  \end{packed_itemize}
+Finally, a semiring can use any built-in or user-defined binary operator
+\verb'z=f(x,y)' as its ``multiply'' operator, as long as the type of its
+output, \verb'z', matches the type of the semiring's monoid.
+The user application can create any semiring based on any types, monoids,
+and multiply operators, as long as these few rules are followed.

-\item Version 6.2.2 (Feb 28, 2022)
+Just considering built-in types and operators, GraphBLAS can perform
+\verb'C=A*B' in thousands of unique semirings.  With typecasting, any of these
+semirings can be applied to matrices \verb'C', \verb'A', and \verb'B' of 13
+predefined types, in any combination.  This results in millions of possible
+kinds of sparse matrix multiplication supported by GraphBLAS, and this is
+counting just built-in types and operators.  By contrast, MATLAB provides just
+two semirings for its sparse matrix multiplication \verb'C=A*B':
+plus-times-double and plus-times-complex, not counting the typecasting that
+MATLAB does when multiplying a real matrix times a complex matrix.

-  \begin{packed_itemize}
-  \item revised output of \verb'GxB_*_sort' to return newly created matrices
-  C and P as full or bitmap matrices, as appropriate, instead of
-  sparse/hypersparse, following their sparsity control settings.
-  \end{packed_itemize}
+A monoid can also be used in a reduction operation, like \verb's=sum(A)' in
+MATLAB.  MATLAB provides the plus, times, min, and max reductions of a real or
+complex sparse matrix as \verb's=sum(A)', \verb's=prod(A)', \verb's=min(A)',
+and \verb's=max(A)', respectively.  In GraphBLAS, any monoid can be used (min,
+max, plus, times, AND, OR, exclusive-OR, equality, bitwise operators,
+or any user-defined monoid on any user-defined type).
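An editorial C sketch of the rules just described: it builds the min-plus tropical semiring from the predefined MIN monoid and PLUS binary operator, uses it in GrB_mxm, and then reduces a matrix to a scalar with the MAX monoid, the analog of s=max(A(:)) in MATLAB (the function name and the FP64 choice are illustrative):

    #include "GraphBLAS.h"

    // Sketch: a user-created semiring from predefined building blocks,
    // plus a whole-matrix reduction with a monoid.
    GrB_Info tropical_demo (GrB_Matrix C, GrB_Matrix A, GrB_Matrix B)
    {
        // min-plus (tropical) semiring: "add" is MIN, "multiply" is PLUS
        GrB_Semiring min_plus ;
        GrB_Semiring_new (&min_plus, GrB_MIN_MONOID_FP64, GrB_PLUS_FP64) ;

        // C = A*B over min-plus (no mask, no accumulator, default settings)
        GrB_Info info = GrB_mxm (C, NULL, NULL, min_plus, A, B, NULL) ;

        // s = "max(A(:))": reduce all entries of A via the MAX monoid
        double s = 0 ;
        GrB_Matrix_reduce_FP64 (&s, NULL, GrB_MAX_MONOID_FP64, A, NULL) ;

        GrB_Semiring_free (&min_plus) ;
        return (info) ;
    }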
-\item Version 6.2.1 (Feb 14, 2022)
+Element-wise operations are also expanded from what can be done in MATLAB.
+Consider matrix addition, \verb'C=A+B' in MATLAB.  The pattern of the result is
+the set union of the pattern of \verb'A' and \verb'B'.  In GraphBLAS, any
+binary operator can be used in this set-union ``addition.''  The operator is
+applied to entries in the intersection.  Entries in \verb'A' but not \verb'B',
+or vice versa, are copied directly into \verb'C', without any application of
+the binary operator.  The accumulator operation for ${\bf Z = C \odot T}$
+described in Section~\ref{accummask} is one example of this set-union
+application of an arbitrary binary operator.

-  \begin{packed_itemize}
-  \item (41) bug fix: \verb'GxB_Iterator_get' used \verb'(void *) + size'
-  arithmetic
-  \end{packed_itemize}
+Consider element-wise multiplication, \verb'C=A.*B' in MATLAB.  The operator
+(multiply in this case) is applied to entries in the set intersection, and the
+pattern of \verb'C' is just this set intersection.  Entries in \verb'A' but not
+\verb'B', or vice versa, do not appear in \verb'C'.  In GraphBLAS, any binary
+operator can be used in this manner, not just scalar multiplication.  The
+difference between element-wise ``add'' and ``multiply'' is not the operators,
+but whether or not the pattern of the result is the set union or the set
+intersection.  In both cases, the operator is only applied to the set
+intersection.

-\item Version 6.2.0 (Feb 14, 2022)
+Finally, GraphBLAS includes a {\em non-blocking} mode where operations can be
+left pending, and saved for later.  This is very useful for submatrix
+assignment (\verb'C(I,J)=A' where \verb'I' and \verb'J' are integer vectors),
+or scalar assignment (\verb'C(i,j)=x' where \verb'i' and \verb'j' are scalar
+integers).  Because of how MATLAB stores its matrices, adding and deleting
+individual entries is very costly.  For example, this is very slow in MATLAB,
+taking $O(nz^2)$ time:

-  \begin{packed_itemize}
-  \item added the \verb'GxB_Iterator' object and its methods.  See
-  Section~\ref{iter}.
-  \item \verb'@GrB' interface: revised sparse-times-full rule for the
-  conventional semiring (the syntax \verb'C=A*B'), so that
-  sparse-times-full results in \verb'C' as full,
-  but hypersparse-times-sparse is not full
-  (typically sparse or hypersparse).
-  \end{packed_itemize}
+    \begin{mdframed}
+    {\footnotesize
+    \begin{verbatim}
+    A = sparse (m,n) ;   % an empty sparse matrix
+    for k = 1:nz
+        compute a value x, row index i, and column index j
+        A (i,j) = x ;
+    end\end{verbatim}}\end{mdframed}
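By contrast, the GraphBLAS version of this loop leaves each insertion pending and assembles all of them in one GrB_Matrix_wait, avoiding the O(nz^2) behavior; an editorial C sketch (the index and value computations are stand-ins):

    #include "GraphBLAS.h"

    // Sketch: build a matrix one entry at a time; in non-blocking mode the
    // insertions are left pending and assembled once by GrB_Matrix_wait.
    GrB_Info build_incrementally (GrB_Index m, GrB_Index n, GrB_Index nz)
    {
        GrB_Matrix A ;
        GrB_Matrix_new (&A, GrB_FP64, m, n) ;
        for (GrB_Index k = 0 ; k < nz ; k++)
        {
            // compute a value x, row index i, and column index j (stand-in)
            double x = (double) k ;
            GrB_Index i = k % m, j = k % n ;
            GrB_Matrix_setElement_FP64 (A, x, i, j) ;   // left pending
        }
        GrB_Info info = GrB_Matrix_wait (A, GrB_MATERIALIZE) ;  // assemble
        GrB_Matrix_free (&A) ;
        return (info) ;
    }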
- \end{packed_itemize} - -\item Version 6.1.3 (Jan 1, 2022) - - \begin{packed_itemize} - \item performance: task creation for \verb'GrB_mxm' - had a minor flaw (results were correct but parallelism suffered). - Performance improvement of up to 10x when nnz(A)<=Z' not returning \verb'C' - as iso if \verb'Z 'iso and \verb'C' initially - empty. Caught by Erik Welch, Anaconda. - \item performance improvements: \verb'C=A*B': sparse/hyper times - bitmap/full, and visa versa, including \verb'C += A*B' when \verb'C' is - full. - \end{packed_itemize} - -\item Version 5.1.10 (Oct 27, 2021) - - \begin{packed_itemize} - \item (35) bug fix: \verb'GB_selector'; \verb'A->plen' and \verb'C->plen' - not updated correctly. Caught by Jeffry Lovitz, Redis. - \end{packed_itemize} - -\item Version 5.1.9 (Oct 26, 2021) - - \begin{packed_itemize} - \item (34) bug fix: in-place test incorrect for \verb"C+=A'*B" using dot4 - \item (33) bug fix: disable free pool if OpenMP not available - \end{packed_itemize} - -\item Version 5.1.8 (Oct 5, 2021) - - \begin{packed_itemize} - \item (32) bug fix: C=A*B when A is sparse and B is iso and bitmap. - Caught by Mark Blanco, CMU. - \end{packed_itemize} - -\item Version 5.1.7 (Aug 23, 2021) - - \begin{packed_itemize} - \item (31) bug fix: \verb'GrB_apply', when done in-place and matrix starts - non-iso and becomes iso, gave the wrong iso result. - Caught by Fabian Murariu. - \end{packed_itemize} - -\item Version 5.1.6 (Aug 16, 2021) - - \begin{packed_itemize} - \item one-line change to \verb'C=A*B': faster symbolic analysis when a - vector \verb'C(:,j)' is dense (for CSC) or \verb'C(i,:)' for CSR. - \end{packed_itemize} - -\item Version 5.1.5 (July 15, 2021) - - \begin{packed_itemize} - \item submission to ACM Transactions on Mathematical Software as - a Collected Algorithm of the ACM. - \end{packed_itemize} - -\item Version 5.1.4 (July 6, 2021) - - \begin{packed_itemize} - \item faster Octave interface. Octave v7 or later is required. - \item (30) bug fix: 1-based printing not enabled for pending tuples. - Caught by Will Kimmerer, while working on the Julia interface. - \end{packed_itemize} - -\item Version 5.1.3 (July 3, 2021) - - \begin{packed_itemize} - \item added \verb'GxB_Matrix_iso' and \verb'GxB_Vector_iso': - to query if a matrix or vector is held as iso-valued - \item (29) bug fix: \verb'Matrix_pack_*R' into a matrix previously held by - column, or \verb'Matrix_pack*C' into a matrix by row, would flip the - dimensions. - Caught by Erik Welch, Anaconda. - \item (28) bug fix: \verb'kron(A,B)' with iso input matrices - \verb'A' and \verb'B' fixed. - Caught by Michel Pelletier, Graphegon. - \item (27) bug fix: v5.1.0 had a wrong version of a file; posted by mistake. - Caught by Michel Pelletier, Graphegon. - \end{packed_itemize} - -\item Version 5.1.2 (June 30, 2021) - - \begin{packed_itemize} - \item iso matrices added: these are matrices and vectors whose - values in the sparsity pattern are all the same. This is an internal - change to the opaque data structures of the \verb'GrB_Matrix' and - \verb'GrB_Vector' with very little change to the API. - \item added \verb'GxB_Matrix_build_Scalar' - and \verb'GxB_Vector_build_Scalar', - which always build iso matrices and vectors. - \item import/export methods can now import/export iso matrices and vectors. - \item added \verb'GrB.argmin/argmax' to Octave/MATLAB interface - \item added \verb'GxB_*_pack/unpack' methods as alternatives to - import/export. - \item added \verb'GxB_PRINT_1BASED' to the global settings. 
- \item added \verb'GxB_*_memoryUsage' - \item port to Octave: \verb'gbmake' and \verb'gbtest' - work in Octave7 to build and test - the \verb'@GrB' interface to GraphBLAS. Octave 7.0.0 is required. - \end{packed_itemize} - -\item Version 5.0.6 (May 24, 2021) - - \begin{packed_itemize} - \item BFS and triangle counting demos removed from GraphBLAS/Demo: - see LAGraph for these algorithms. Eventually, all of GraphBLAS/Demo - will be deleted, once LAGraph includes all the methods included there. - \end{packed_itemize} - -\item Version 5.0.5 (May 17, 2021) - - \begin{packed_itemize} - \item (26) performance bug fix: reduce-to-vector where \verb'A' is - hypersparse CSR with a transposed descriptor (or CSC with no - transpose), and some cases for \verb'GrB_mxm/mxv/vxm' when computing - \verb'C=A*B' with A hypersparse CSC and \verb'B' bitmap/full (or - \verb'A' bitmap/full and \verb'B' hypersparse CSR), the wrong internal - method was being selected via the auto-selection strategy, resulting in - a significant slowdown in some cases. - \end{packed_itemize} - -\item Version 5.0.4 (May 13, 2021) - - \begin{packed_itemize} - \item \verb'@GrB' Octave/MATLAB interface: changed license - to GNU General Public License v3.0 or later. - \end{packed_itemize} - -\item Version 5.0.3 (May 12, 2021) - - \begin{packed_itemize} - \item (25) bug fix: disabling \verb'ANY_PAIR' semirings by editing - \verb'Source/GB_control.h' would cause a segfault if those disabled - semirings were used. - \item demos are no longer built by default - \item (24) bug fix: new functions in v5.0.2 not declared as \verb'extern' - in \verb'GraphBLAS.h'. - \item \verb'GrB_Matrix_reduce_BinaryOp' reinstated from v4.0.3; - same limit on built-in ops that correspond to known monoids. - \end{packed_itemize} - -\item Version 5.0.2 (May 5, 2021) - - \begin{packed_itemize} - \item (23) bug fix: \verb'GrB_Matrix_apply_BinaryOp1st' and \verb'2nd' - were using the - wrong descriptors for \verb'GrB_INP0' and \verb'GrB_INP1'. - Caught by Erik Welch, Anaconda. - \item memory pool added for faster allocation/free of small blocks - \item \verb'@GrB' interface ported to MATLAB R2021a. - \item \verb'GxB_PRINTF' and \verb'GxB_FLUSH' global options added. - \item \verb'GxB_Matrix_diag': construct a diagonal matrix from a vector - \item \verb'GxB_Vector_diag': extract a diagonal from a matrix - \item \verb'concat/split': methods to concatenate and split matrices. - \item \verb'import/export': - size of arrays now in bytes, not entries. This change - is required for better internal memory management, and it is not - backward compatible with the \verb'GxB*import/export' functions in v4.0. - A new parameter, \verb'is_uniform', has been added to all import/export - methods, which indicates that the matrix values are all the same. - \item (22) bug fix: SIMD vectorization was missing - \verb'reduction(+,task_cnvals)' in - \verb'GB_dense_subassign_06d_template.c'. Caught by Jeff Huang, Texas - A\&M, with his software package for race-condition detection. - \item \verb'GrB_Matrix_reduce_BinaryOp': removed. Use a monoid instead, - with \verb'GrB_reduce' or \verb'GrB_Matrix_reduce_Monoid'. 
- \end{packed_itemize}
-
-\item Version 4.0.3 (Jan 19, 2021)
-
-    \begin{packed_itemize}
-    \item faster min/max monoids
-    \item \verb'G=GrB(G)' converts \verb'G' from v3 object to v4
-    \end{packed_itemize}
-
-\item Version 4.0.2 (Jan 13, 2021)
-
-    \begin{packed_itemize}
-    \item ability to load \verb'*.mat' files saved with the v3 \verb'GrB'
-    \end{packed_itemize}
-
-\item Version 4.0.1 (Jan 4, 2021)
-
-    \begin{packed_itemize}
-    \item significant performance improvements: compared with v3.3.3,
-        up to 5x faster in breadth-first-search (using
-        \verb'LAGraph_bfs_parent2'), and 2x faster in
-        Betweenness-Centrality (using \verb'LAGraph_bc_batch5').
-    \item \verb'GrB_wait(void)', with no inputs: removed
-    \item \verb'GrB_wait(&object)': polymorphic function added
-    \item \verb'GrB_*_nvals': no longer guarantees completion;
-        use \verb'GrB_wait(&object)'
-        or non-polymorphic \verb'GrB_*_wait (&object)' instead
-    \item \verb'GrB_error': now has two parameters: a string
-        (\verb'char **') and an object.
-    \item \verb'GrB_Matrix_reduce_BinaryOp' limited to built-in operators that
-        correspond to known monoids.
-    \item \verb'GrB_*_extractTuples': may return indices out of order
-    \item removed internal features: GBI iterator, slice and hyperslice matrices
-    \item bitmap/full matrices and vectors added
-    \item positional operators and semirings:
-        \verb'GxB_FIRSTI_INT32' and related ops
-    \item jumbled matrices: sort left pending, like zombies and pending tuples
-    \item \verb'GxB_get/set': added \verb'GxB_SPARSITY_*'
-        (hyper, sparse, bitmap, or full) and \verb'GxB_BITMAP_SWITCH'.
-    \item \verb'GxB_HYPER': enum renamed to \verb'GxB_HYPER_SWITCH'
-    \item \verb'GxB*import/export': API modified
-    \item \verb'GxB_SelectOp': \verb'nrows' and \verb'ncols' removed
-        from function signature.
-    \item OpenMP tasking removed from mergesort and replaced with parallel
-        for loops.  Just as fast on Linux/Mac; now the performance ports to
-        Windows.
-    \item \verb'GxB_BURBLE' added as a supported feature.  This was an
-        undocumented feature of prior versions.
-    \item bug fix: \verb'A({lo,hi})=scalar' was broken;
-        \verb'A(lo:hi)=scalar' was OK
-    \end{packed_itemize}
-
-\item Version 3.3.3 (July 14, 2020).
-    Bug fix: \verb'w=A*u' with mask non-empty and \verb'u' empty.
-
-\item Version 3.3.2 (July 3, 2020).  Minor changes to build system.
-
-\item Version 3.3.1 (June 30, 2020).  Bug fix to \verb'GrB_assign' and
-    \verb'GxB_subassign' when the assignment is simple (\verb'C=A') but
-    with typecasting.
-
-\item Version 3.3.0 (June 26, 2020).  Compliant with V1.3 of the C API
-    (except that the polymorphic \verb'GrB_wait(&object)' doesn't appear yet;
-    it will appear in V4.0).
-
-    Added complex types (\verb'GxB_FC32' and \verb'GxB_FC64'), many unary
-    operators, binary operators, monoids, and semirings.  Added bitwise
-    operators, and their monoids and semirings.  Added the predefined monoids
-    and semirings from the v1.3 specification.  \verb'@GrB' interface: added complex
-    matrices and operators, and changed behavior of integer operations to more
-    closely match the behavior on built-in integer matrices.  The rules for
-    typecasting large floating point values to integers have changed.  The
-    specific object-based \verb'GrB_Matrix_wait', \verb'GrB_Vector_wait', etc.,
-    functions have been added.  The no-argument \verb'GrB_wait()' is
-    deprecated.
Added \verb'GrB_getVersion', \verb'GrB_Matrix_resize',
-    \verb'GrB_Vector_resize', \verb'GrB_kronecker', \verb'GrB_*_wait', scalar
-    binding with binary operators for \verb'GrB_apply', \newline
-    \verb'GrB_Matrix_removeElement', and \verb'GrB_Vector_removeElement'.
-
-\item Version 3.2.0 (Feb 20, 2020).  Faster \verb'GrB_mxm', \verb'GrB_mxv', and
-    \verb'GrB_vxm', and faster operations on dense matrices/vectors.  Removed
-    compile-time user objects (\verb'GxB_*_define'), since these were not
-    compatible with the faster matrix operations.  Added the \verb'ANY' and
-    \verb'PAIR' operators.  Added the predefined descriptors,
-    \verb'GrB_DESC_*'.  Added the structural mask option.  Changed default
-    chunk size to 65,536.  \verb'@GrB' interface modified: \verb'GrB.init' is
-    now optional.
-
-\item Version 3.1.2 (Dec 2019).  Changes to allow SuiteSparse:GraphBLAS
-    to be compiled with the Microsoft Visual Studio compiler.  This compiler
-    does not support the \verb'_Generic' keyword, so the polymorphic functions
-    are not available.  Use the equivalent non-polymorphic functions instead,
-    when compiling GraphBLAS with MS Visual Studio.  In addition,
-    variable-length arrays are not supported, so user-defined types are limited
-    to 128 bytes in size.  These changes have no effect if you have an ANSI C11
-    compliant compiler.
-
-    \verb'@GrB' interface modified: \verb'GrB.init' is now required.
-
-\item Version 3.1.0 (Oct 1, 2019).  \verb'@GrB' interface added.  See the
-    \newline \verb'GraphBLAS/GraphBLAS' folder for details and documentation,
-    and Section~\ref{octave}.
-
-\item Version 3.0 (July 26, 2019), with OpenMP parallelism.
-
-The version number is increased to 3.0, since
-this version is not backward compatible with V2.x.  The \verb'GxB_select'
-operation changes; the \verb'Thunk' parameter was formerly a
-\verb'const void *' pointer, and is now a \verb'GxB_Scalar'.  A new parameter
-is added to \verb'GxB_SelectOp_new', to define the expected type of
-\verb'Thunk'.  A new parameter is added to \verb'GxB_init', to specify whether
-or not the user-provided memory management functions are thread safe.
-
-The remaining changes add new features, and are upward compatible with V2.x.
-The major change is the addition of OpenMP parallelism.  This addition has no
-effect on the API, except that round-off errors can differ with the number of
-threads used, for floating-point types.  \verb'GxB_set' can optionally define
-the number of threads to use (the default is \verb'omp_get_max_threads').  The
-number of threads can also be defined globally, and/or in the
-\verb'GrB_Descriptor'.  The \verb'RDIV' and \verb'RMINUS' operators are added,
-which are defined as $f(x,y)=y/x$ and $f(x,y)=y-x$, respectively.  Additional
-options are added to \verb'GxB_get'.
-
-\item Version 2.3.3 (May 2019): Collected Algorithm of the ACM.
-No changes from V2.3.2 other than the documentation.
-
-\item Version 2.3 (Feb 2019) improves the performance of many GraphBLAS
-operations, including an early-exit for monoids.  These changes have a
-significant impact on breadth-first-search (a performance bug was also fixed in
-the two BFS \verb'Demo' codes).  The matrix and vector import/export functions
-were added (Section~\ref{pack_unpack}), in support of the new LAGraph project
-(\url{https://github.com/GraphBLAS/LAGraph}, see also Section~\ref{lagraph}).
-LAGraph includes a push-pull BFS in GraphBLAS that is faster than two versions
-in the \verb'Demo' folder.
\verb'GxB_init' was added to allow the memory
-manager functions (\verb'malloc', etc.) to be specified.
-
-\item
-Version 2.2 (Nov 2018)
-adds user-defined objects at compile-time, via user \verb'*.m4' files placed in
-\verb'GraphBLAS/User', which use the \verb'GxB_*_define' macros
-(NOTE: feature removed in v3.2).
-The default matrix format is now \verb'GxB_BY_ROW'.
-Also added are the \verb'GxB_*print' methods for printing the contents of each
-GraphBLAS object (Section~\ref{fprint}).  PageRank demos have been added to
-the \verb'Demo' folder.
-
-\item
-Version 2.1 (Oct 2018) was
-a major update with support for new matrix formats
-(by row or column, and hypersparse matrices), and colon notation
-(\verb'I=begin:end' or \verb'I=begin:inc:end').  Some graph algorithms are more
-naturally expressed with matrices stored by row, and this version includes the
-new \verb'GxB_BY_ROW' format.  The default format in Version 2.1 and
-prior versions is by column.
-New extensions to GraphBLAS in this version include \verb'GxB_get',
-\verb'GxB_set', and \verb'GxB_AxB_METHOD', \verb'GxB_RANGE', \verb'GxB_STRIDE',
-and \verb'GxB_BACKWARDS', and their related definitions, described in
-Sections~\ref{descriptor},~\ref{options},~and~\ref{colon}.
-
-\item
-Version 2.0 (March 2018) addressed changes in the GraphBLAS C API
-Specification and added \verb'GxB_kron' and \verb'GxB_resize'.
-
-\item
-Version 1.1 (Dec 2017) primarily improved the performance.
-
-\item
-Version 1.0 was released on Nov 25, 2017.
-\end{itemize}
-
-%-------------------------------------------------------------------------------
-\subsubsection{Regarding historical and deprecated functions and symbols}
-%-------------------------------------------------------------------------------
-
-When a \verb'GxB*' function or symbol is added to the C API Specification with
-a \verb'GrB*' name, the new \verb'GrB*' name should be used instead, if
-possible.  However, the old \verb'GxB*' name will be kept as long as possible
-for historical reasons.  Historical functions and symbols will not always be
-documented here in the SuiteSparse:GraphBLAS User Guide, but they will be kept
-in \verb'GraphBLAS.h' and kept in good working order in the library.
-Historical functions and symbols would only be removed in the very unlikely
-case that they cause a serious conflict with future methods.
-
-The only methods that have been fully deprecated and removed are the older
-versions of the \verb'GrB_wait' and \verb'GrB_error' methods, which are
-incompatible with the latest versions.
-
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Basic Concepts} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{basic}
-
-Since the {\em GraphBLAS C API Specification} provides a precise definition of
-GraphBLAS, not every detail of every function is provided here.  For example,
-some error codes returned by GraphBLAS are self-explanatory, but since a
-specification must precisely define all possible error codes a function can
-return, these are listed in detail in the {\em GraphBLAS C API Specification}.
-However, including them here is not essential and the additional information on
-the page might detract from a clearer view of the essential features of the
-GraphBLAS functions.
-
-This User Guide also assumes the reader is familiar with Octave/MATLAB.
-MATLAB supports only the conventional plus-times semiring on sparse
-double and complex matrices, but a MATLAB-like notation easily extends to the
-arbitrary semirings used in GraphBLAS.  The matrix multiplication in the
-example in the Introduction can be written in MATLAB notation as
-\verb'C=A*B', if the Boolean \verb'OR-AND' semiring is understood.  Relying on
-a MATLAB-like notation allows the description in this User Guide to be
-expressive, easy to understand, and terse at the same time.  {\em The GraphBLAS
-C API Specification} also makes use of some MATLAB-like language, such
-as the colon notation.
-
-MATLAB notation will always appear here in fixed-width font, such as
-\verb'C=A*B(:,j)'.  In standard mathematical notation it would be written as
-the matrix-vector multiplication ${\bf C = A b}_j$ where ${\bf b}_j$ is the
-$j$th column of the matrix ${\bf B}$.  The GraphBLAS standard is a C API and
-SuiteSparse:GraphBLAS is written in C, and so a great deal of C syntax appears
-here as well, also in fixed-width font.  This User Guide alternates between all
-three styles as needed.
-
-%===============================================================================
-\subsection{Graphs and sparse matrices} %=======================================
-%===============================================================================
-\label{sparse}
-
-Graphs can be huge, with many nodes and edges.  A dense adjacency matrix ${\bf
-A}$ for a graph of $n$ nodes takes $O(n^2)$ memory, which is impossible if $n$
-is, say, a million.  Let $|{\bf A}|$ denote the number of entries in a matrix
-${\bf A}$: the number of edges in the graph, or equivalently the number of
-explicit entries present in the data structure for the matrix.
-Most graphs arising in practice are sparse, however, with only $|{\bf A}|=O(n)$
-edges.  Sparse graphs with millions of nodes and edges can easily be created by
-representing them as sparse matrices, where only explicit values need to be
-stored.  Some graphs are {\em hypersparse}, with $|{\bf A}| \ll n$.
-SuiteSparse:GraphBLAS supports three kinds of sparse matrix formats: a regular
-sparse format, taking $O(n+|{\bf A}|)$ space, a hypersparse format taking only
-$O(|{\bf A}|)$ space, and a bitmap form, taking $O(n^2)$ space.  Full matrices
-are also represented in $O(n^2)$ space.  Using its hypersparse format, creating
-a sparse matrix of size $n$-by-$n$ where $n=2^{60}$ (about $10^{18}$) can be
-done quite easily on a commodity laptop, limited only by $|{\bf A}|$.
-To the GraphBLAS user application, all matrices look alike, since these formats
-are opaque, and SuiteSparse:GraphBLAS switches between them at will.
-
-A sparse matrix data structure only stores a subset of the possible $n^2$
-entries, and it assumes the values of entries not stored have some implicit
-value.  In conventional linear algebra, this implicit value is zero, but it
-differs with different semirings.  Explicit values are called {\em entries} and
-they appear in the data structure.  The {\em pattern} (also called the
-{\em structure}) of a matrix defines where its explicit entries appear.  It
-will be referenced in one of two equivalent ways.  It can be viewed as a set of
-indices $(i,j)$, where $(i,j)$ is in the pattern of a matrix ${\bf A}$ if ${\bf
-A}(i,j)$ is an explicit value.  It can also be viewed as a Boolean matrix ${\bf
-S}$ where ${\bf S}(i,j)$ is true if $(i,j)$ is an explicit entry and false
-otherwise.
In MATLAB notation, \verb'S=spones(A)' or \verb'S=(A~=0)', if the
-implicit value is zero.  The \verb'(i,j)' pairs, and their values, can also be
-extracted from the matrix via the MATLAB expression \verb'[I,J,X]=find(A)',
-where the \verb'k'th tuple \verb'(I(k),J(k),X(k))' represents the explicit
-entry \verb'A(I(k),J(k))', with numerical value \verb'X(k)' equal to $a_{ij}$,
-with row index $i$=\verb'I(k)' and column index $j$=\verb'J(k)'.
-
-The entries in the pattern of ${\bf A}$ can take on any value, including the
-implicit value, whatever it happens to be.  This differs slightly from MATLAB,
-which always drops all explicit zeros from its sparse matrices.  This is a
-minor difference but GraphBLAS cannot drop explicit zeros.  For example, in the
-max-plus tropical algebra, the implicit value is negative infinity, and zero
-has a different meaning.  Here, the MATLAB notation used will assume that no
-explicit entries are ever dropped because their explicit value happens to match
-the implicit value.
-
-{\em Graph Algorithms in the Language of Linear Algebra}, Kepner and Gilbert,
-eds., provides a framework for understanding how graph algorithms can be
-expressed as matrix computations \cite{KepnerGilbert2011}.  For additional
-background on sparse matrix algorithms, see also \cite{Davis06book} and
-\cite{DavisRajamanickamSidLakhdar16}.
-
-%===============================================================================
-\subsection{Overview of GraphBLAS methods and operations} %=====================
-%===============================================================================
-\label{overview}
-
-GraphBLAS provides a collection of {\em methods} to create, query, and free its
-objects: sparse matrices, sparse vectors, scalars, types, operators,
-monoids, semirings, and a descriptor object used for parameter settings.
-Details are given in Section~\ref{objects}.  Once these objects are created
-they can be used in mathematical {\em operations} (not to be confused with
-how the term {\em operator} is used in GraphBLAS).  A short summary of these
-operations and their nearest Octave/MATLAB analog is given in the table below.
-
-% \vspace{0.1in}
-\begin{tabular}{ll}
-operation & approximate Octave/MATLAB analog \\
-\hline
-matrix multiplication & \verb'C=A*B' \\
-element-wise operations & \verb'C=A+B' and \verb'C=A.*B' \\
-reduction to a vector or scalar & \verb's=sum(A)' \\
-apply unary operator & \verb'C=-A' \\
-transpose & \verb"C=A'" \\
-submatrix extraction & \verb'C=A(I,J)' \\
-submatrix assignment & \verb'C(I,J)=A' \\
-select & \verb'C=tril(A)' \\
-\hline
-\end{tabular}
-\vspace{0.1in}
-
-GraphBLAS can do far more than what Octave/MATLAB can do in these rough
-analogs, but the list provides a first step in describing what GraphBLAS can
-do.  Details of each GraphBLAS operation are given in Section~\ref{operations}.
-With this brief overview, the full scope of GraphBLAS extensions of these
-operations can now be described.
-
-SuiteSparse:GraphBLAS has 13 built-in scalar types: Boolean, single and double
-precision floating-point (real and complex), and 8, 16, 32, and 64-bit signed
-and unsigned integers.  In addition, user-defined scalar types can be created
-from nearly any C \verb'typedef', as long as the entire type fits in a
-fixed-size contiguous block of memory (of arbitrary size).  All of these types
-can be used to create GraphBLAS sparse matrices, vectors, or scalars.
-
-The scalar addition of conventional matrix multiplication is replaced with a
-{\em monoid}.
A monoid is an associative and commutative binary operator
-\verb'z=f(x,y)' where all three domains are the same (the types of \verb'x',
-\verb'y', and \verb'z'), and where the operator has an identity value \verb'id'
-such that \verb'f(x,id)=f(id,x)=x'.  Performing matrix multiplication with a
-semiring uses a monoid in place of the ``add'' operator, scalar addition being
-just one of many possible monoids.  The identity value of addition is zero,
-since $x+0=0+x=x$.   GraphBLAS includes many built-in operators suitable for
-use as a monoid: min (with an identity value of positive infinity), max (whose
-identity is negative infinity), add (identity is zero), multiply (with an
-identity of one), four logical operators: AND, OR, exclusive-OR, and
-Boolean equality (XNOR), four bitwise operators (AND, OR, XOR, and XNOR),
-and the ANY operator.
-See Section~\ref{any_pair} for more details on the unusual ANY operator.
-User-created monoids can be defined with any associative and
-commutative operator that has an identity value.
-
-Finally, a semiring can use any built-in or user-defined binary operator
-\verb'z=f(x,y)' as its ``multiply'' operator, as long as the type of its
-output, \verb'z', matches the type of the semiring's monoid.
-The user application can create any semiring based on any types, monoids,
-and multiply operators, as long as these few rules are followed.
-
-Just considering built-in types and operators, GraphBLAS can perform
-\verb'C=A*B' in thousands of unique semirings.  With typecasting, any of these
-semirings can be applied to matrices \verb'C', \verb'A', and \verb'B' of 13
-predefined types, in any combination.  This results in millions of possible
-kinds of sparse matrix multiplication supported by GraphBLAS, and this is
-counting just built-in types and operators.  By contrast, MATLAB provides just
-two semirings for its sparse matrix multiplication \verb'C=A*B':
-plus-times-double and plus-times-complex, not counting the typecasting that
-MATLAB does when multiplying a real matrix times a complex matrix.
-
-A monoid can also be used in a reduction operation, like \verb's=sum(A)' in
-MATLAB.  MATLAB provides the plus, times, min, and max reductions of a real or
-complex sparse matrix as \verb's=sum(A)', \verb's=prod(A)', \verb's=min(A)',
-and \verb's=max(A)', respectively.  In GraphBLAS, any monoid can be used (min,
-max, plus, times, AND, OR, exclusive-OR, equality, bitwise operators,
-or any user-defined monoid on any user-defined type).
-
-Element-wise operations are also expanded from what can be done in MATLAB.
-Consider matrix addition, \verb'C=A+B' in MATLAB.  The pattern of the result is
-the set union of the pattern of \verb'A' and \verb'B'.  In GraphBLAS, any
-binary operator can be used in this set-union ``addition.''  The operator is
-applied to entries in the intersection.  Entries in \verb'A' but not \verb'B',
-or vice versa, are copied directly into \verb'C', without any application of
-the binary operator.  The accumulator operation for ${\bf Z = C \odot T}$
-described in Section~\ref{accummask} is one example of this set-union
-application of an arbitrary binary operator.
-
-Consider element-wise multiplication, \verb'C=A.*B' in MATLAB.  The operator
-(multiply in this case) is applied to entries in the set intersection, and the
-pattern of \verb'C' is just this set intersection.  Entries in \verb'A' but not
-\verb'B', or vice versa, do not appear in \verb'C'.  In GraphBLAS, any binary
-operator can be used in this manner, not just scalar multiplication.  The
-difference between element-wise ``add'' and ``multiply'' is not the operators,
-but whether or not the pattern of the result is the set union or the set
-intersection.  In both cases, the operator is only applied to the set
-intersection.
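+
+As a concrete sketch of these rules (not part of the original text; it assumes
+\verb'GrB_FP64' matrices \verb'A' and \verb'B', both \verb'n'-by-\verb'n',
+already exist), the following builds the min-plus ``tropical'' semiring, uses
+it in \verb'GrB_mxm', and then applies the same binary operator in both the
+set-union and set-intersection element-wise operations:
+
+    {\footnotesize
+    \begin{verbatim}
+    // "add" is the MIN monoid, "multiply" is PLUS; the predefined
+    // GrB_MIN_PLUS_SEMIRING_FP64 could also be used directly.
+    GrB_Semiring minplus ;
+    GrB_Semiring_new (&minplus, GxB_MIN_FP64_MONOID, GrB_PLUS_FP64) ;
+    GrB_Matrix C ;
+    GrB_Matrix_new (&C, GrB_FP64, n, n) ;
+    // C = A*B over min-plus (no mask, no accumulator)
+    GrB_mxm (C, NULL, NULL, minplus, A, B, NULL) ;
+    // set-union "addition": pattern of C is the union of both patterns
+    GrB_Matrix_eWiseAdd_BinaryOp (C, NULL, NULL, GrB_MIN_FP64, A, B, NULL) ;
+    // set-intersection "multiplication": pattern is the intersection
+    GrB_Matrix_eWiseMult_BinaryOp (C, NULL, NULL, GrB_MIN_FP64, A, B, NULL) ;
+    GrB_Semiring_free (&minplus) ;
+    \end{verbatim}}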
-
-Finally, GraphBLAS includes a {\em non-blocking} mode where operations can be
-left pending, and saved for later.  This is very useful for submatrix
-assignment (\verb'C(I,J)=A' where \verb'I' and \verb'J' are integer vectors),
-or scalar assignment (\verb'C(i,j)=x' where \verb'i' and \verb'j' are scalar
-integers).  Because of how MATLAB stores its matrices, adding and deleting
-individual entries is very costly.  For example, this is very slow in MATLAB,
-taking $O(nz^2)$ time:
-
-   \begin{mdframed}
-   {\footnotesize
-   \begin{verbatim}
-   A = sparse (m,n) ;   % an empty sparse matrix
-   for k = 1:nz
-       compute a value x, row index i, and column index j
-       A (i,j) = x ;
-   end\end{verbatim}}\end{mdframed}
-
-The above code is very easy to read and simple to write, but exceedingly slow.
-In MATLAB, the method below is preferred and is far faster, taking at most
-$O(|{\bf A}| \log |{\bf A}| +n)$ time.  It can easily be a million times faster
-than the method above.  Unfortunately the second method below is a little
-harder to read and a little less natural to write:
+The above code is very easy to read and simple to write, but exceedingly slow.
+In MATLAB, the method below is preferred and is far faster, taking at most
+$O(|{\bf A}| \log |{\bf A}| +n)$ time.  It can easily be a million times faster
+than the method above.  Unfortunately the second method below is a little
+harder to read and a little less natural to write:
 
    \begin{mdframed}
    {\footnotesize
@@ -1415,7 +825,7 @@ \subsection{Python Interface}
 \url{https://anaconda.org/conda-forge/pygraphblas}.
 
 See Jim Kitchen and Erik Welch's (both from Anaconda, Inc.) Python interface at
-\url{https://github.com/metagraph-dev/grblas}.
+\url{https://github.com/python-graphblas/python-graphblas} (formerly known as grblas).
 
 See also \\
 \url{https://anaconda.org/conda-forge/graphblas}.
@@ -1534,6 +944,7 @@ \section{Performance of MATLAB versus GraphBLAS}
     C=S+S:   MATLAB:   14.3368 GrB:    1.5539 speedup:     9.23
     C=S+B:   MATLAB:   15.5600 GrB:    1.5098 speedup:    10.31
     C=S(p,q) MATLAB:   95.6219 GrB:   15.9468 speedup:     6.00
 \end{verbatim}
+}
 
 \newpage
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -2547,14 +1958,6 @@ \subsection{GraphBLAS unary operators: {\sf GrB\_UnaryOp}, $z=f(x)$} %==========
 \verb'GxB_ROUND_'$F$    & $F \rightarrow F$ & $z = \mbox{round}(x)$ & round to nearest \\
 \verb'GxB_TRUNC_'$F$    & $F \rightarrow F$ & $z = \mbox{trunc}(x)$ & round towards zero \\
 \hline
-\verb'GxB_LGAMMA_'$F$   & $F \rightarrow F$ & $z = \log(|\Gamma (x)|)$ & log of gamma function \\
-\verb'GxB_TGAMMA_'$F$   & $F \rightarrow F$ & $z = \Gamma(x)$ & gamma function \\
-\verb'GxB_ERF_'$F$      & $F \rightarrow F$ & $z = \erf(x)$ & error function \\
-\verb'GxB_ERFC_'$F$     & $F \rightarrow F$ & $z = \erfc(x)$ & complimentary error function \\
-\hline
-\verb'GxB_FREXPX_'$F$   & $F \rightarrow F$ & $z = \mbox{frexpx}(x)$ & normalized fraction \\
-\verb'GxB_FREXPE_'$F$   & $F \rightarrow F$ & $z = \mbox{frexpe}(x)$ & normalized exponent \\
-\hline
 \verb'GxB_ISINF_'$F$    & $F \rightarrow $ \verb'bool' & $z = \mbox{isinf}(x)$ & true if $\pm \infty$ \\
 \verb'GxB_ISNAN_'$F$    & $F \rightarrow $ \verb'bool' & $z = \mbox{isnan}(x)$ & true if \verb'NaN' \\
 \verb'GxB_ISFINITE_'$F$ & $F \rightarrow $ \verb'bool' & $z = \mbox{isfinite}(x)$ & true if finite \\
@@ -2562,6 +1965,24 @@ \subsection{GraphBLAS unary operators: {\sf GrB\_UnaryOp}, $z=f(x)$} %==========
 \end{tabular}
 \vspace{0.2in}
 
+\begin{tabular}{|llll|}
+\hline
+\multicolumn{4}{|c|}{Unary operators for floating-point types (real only)} \\
+\hline
+GraphBLAS name & types (domains) & $z=f(x)$ & description \\
+\hline
+\verb'GxB_LGAMMA_'$R$   & $R \rightarrow R$ & $z = \log(|\Gamma (x)|)$ & log of gamma function \\
+\verb'GxB_TGAMMA_'$R$   & $R \rightarrow R$ & $z = \Gamma(x)$ & gamma function \\
+\verb'GxB_ERF_'$R$      & $R \rightarrow R$ & $z = \erf(x)$ & error function \\
+\verb'GxB_ERFC_'$R$     & $R \rightarrow R$ & $z = \erfc(x)$ & complementary error function \\
+\verb'GxB_CBRT_'$R$     & $R \rightarrow R$ & $z = x^{1/3}$ & cube root \\
+\hline
+\verb'GxB_FREXPX_'$R$   & $R \rightarrow R$ & $z = \mbox{frexpx}(x)$ & normalized fraction \\
+\verb'GxB_FREXPE_'$R$   & $R \rightarrow R$ & $z = \mbox{frexpe}(x)$ & normalized exponent \\
+\hline
+\end{tabular}
+\vspace{0.2in}
+
 \begin{tabular}{|llll|}
 \hline
 \multicolumn{4}{|c|}{Unary operators for complex types} \\
@@ -4522,6 +3943,7 @@ \subsection{GraphBLAS vectors: {\sf GrB\_Vector}} %=============================
 \verb'GxB_Vector_build_Scalar'   & build a vector from tuples & \ref{vector_build_Scalar} \\
 \verb'GrB_Vector_setElement'     & add an entry to a vector & \ref{vector_setElement} \\
 \verb'GrB_Vector_extractElement' & get an entry from a vector & \ref{vector_extractElement} \\
+\verb'GxB_Vector_isStoredElement'& check if entry present in vector & \ref{vector_isStoredElement} \\
 \verb'GrB_Vector_removeElement'  & remove an entry from a vector & \ref{vector_removeElement} \\
 \verb'GrB_Vector_extractTuples'  & get all entries from a vector & \ref{vector_extractTuples} \\
 \verb'GrB_Vector_resize'         & resize a vector & \ref{vector_resize} \\
@@ -4857,6 +4279,26 @@ \subsubsection{{\sf GrB\_Vector\_extractElement:} get an entry from a vector}
 \verb'x = A(i,0)' from an \verb'n'-by-1 matrix; see
 Section~\ref{matrix_extractElement}.
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Vector\_isStoredElement:} check if entry present in vector}
+%-------------------------------------------------------------------------------
+\label{vector_isStoredElement}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Vector_isStoredElement
+(
+    const GrB_Vector v,             // check presence of entry v(i)
+    GrB_Index i                     // index
+) ;
+\end{verbatim} } \end{mdframed}
+
+\verb'GxB_Vector_isStoredElement' checks if a single entry \verb'v(i)'
+is present, returning \verb'GrB_SUCCESS' if the entry is present or
+\verb'GrB_NO_VALUE' otherwise.  The value of \verb'v(i)' is not returned.
+See also Section~\ref{matrix_isStoredElement}.
+
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GrB\_Vector\_removeElement:} remove an entry from a vector}
 %-------------------------------------------------------------------------------
@@ -5049,6 +4491,7 @@ \subsection{GraphBLAS matrices: {\sf GrB\_Matrix}} %============================
 \verb'GxB_Matrix_build_Scalar'   & build a matrix from tuples & \ref{matrix_build_Scalar} \\
 \verb'GrB_Matrix_setElement'     & add an entry to a matrix & \ref{matrix_setElement} \\
 \verb'GrB_Matrix_extractElement'& get an entry from a matrix & \ref{matrix_extractElement} \\
+\verb'GxB_Matrix_isStoredElement'& check if entry present in matrix & \ref{matrix_isStoredElement} \\
 \verb'GrB_Matrix_removeElement'  & remove an entry from a matrix & \ref{matrix_removeElement} \\
 \verb'GrB_Matrix_extractTuples'  & get all entries from a matrix & \ref{matrix_extractTuples} \\
 \verb'GrB_Matrix_resize'         & resize a matrix & \ref{matrix_resize} \\
@@ -5556,7 +4999,6 @@ \subsubsection{{\sf GrB\_Matrix\_extractElement:} get an entry from a matrix}
     GrB_Index i,                        // row index
     GrB_Index j                         // column index
 ) ;
-
 GrB_Info GrB_Matrix_extractElement      // x = A(i,j)
 (
     GrB_Scalar x,                       // extracted GrB_Scalar
@@ -5568,14 +5010,11 @@ \subsubsection{{\sf GrB\_Matrix\_extractElement:} get an entry from a matrix}
 
 \verb'GrB_Matrix_extractElement' extracts a single entry from a matrix
 \verb'x=A(i,j)'.
-
 An error is returned (\verb'GrB_INVALID_INDEX') if the row index \verb'i' is
 greater than or equal to the number of rows of \verb'A', or if column index
 \verb'j' is greater than or equal to the number of columns of \verb'A'.
-
 If the entry is present, \verb'x=A(i,j)' is performed and the scalar \verb'x'
 is returned with this value.  The method returns \verb'GrB_SUCCESS'.
-
 If no entry is present at \verb'A(i,j)', and \verb'x' is a non-opaque C
 scalar, then \verb'x' is not modified, and the return value of
 \verb'GrB_Matrix_extractElement' is \verb'GrB_NO_VALUE'.  If \verb'x' is a
@@ -5595,7 +5034,28 @@ \subsubsection{{\sf GrB\_Matrix\_extractElement:} get an entry from a matrix}
 functions.  Everything will work correctly and results will be predictable, it
 will just be slow.
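+
+For illustration, a sketch (not from the original text; it assumes a
+\verb'GrB_FP64' matrix \verb'A' and valid indices \verb'i' and \verb'j'
+already exist):
+
+    {\footnotesize
+    \begin{verbatim}
+    double x ;
+    GrB_Info info = GrB_Matrix_extractElement_FP64 (&x, A, i, j) ;
+    if (info == GrB_SUCCESS)
+    {
+        // A(i,j) is present, and its value is now in x
+    }
+    else if (info == GrB_NO_VALUE)
+    {
+        // A(i,j) is not in the pattern of A; x is unchanged
+    }
+    // presence-only test, without reading the value (see the next section):
+    bool present = (GxB_Matrix_isStoredElement (A, i, j) == GrB_SUCCESS) ;
+    \end{verbatim}}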
+%-------------------------------------------------------------------------------
 \newpage
+\subsubsection{{\sf GxB\_Matrix\_isStoredElement:} check if entry present in matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_isStoredElement}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_isStoredElement
+(
+    const GrB_Matrix A,                 // check for A(i,j)
+    GrB_Index i,                        // row index
+    GrB_Index j                         // column index
+) ;
+\end{verbatim} } \end{mdframed}
+
+\verb'GxB_Matrix_isStoredElement' checks if the single entry \verb'A(i,j)' is
+present in the matrix \verb'A'.  It returns \verb'GrB_SUCCESS' if the entry is
+present, or \verb'GrB_NO_VALUE' otherwise.  The value of \verb'A(i,j)' is not
+returned.  It is otherwise identical to \verb'GrB_Matrix_extractElement'.
+
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GrB\_Matrix\_removeElement:} remove an entry from a matrix}
 %-------------------------------------------------------------------------------
@@ -5612,9 +5072,10 @@ \subsubsection{{\sf GrB\_Matrix\_removeElement:} remove an entry from a matrix}
 ) ;
 \end{verbatim} } \end{mdframed}
 
-\verb'GrB_Matrix_removeElement' removes a single entry \verb'A(i,j)' from a matrix.
-If no entry is present at \verb'A(i,j)', then the matrix is not modified.
-If an error occurs, \verb'GrB_error(&err,A)' returns details about the error.
+\verb'GrB_Matrix_removeElement' removes a single entry \verb'A(i,j)' from a
+matrix.  If no entry is present at \verb'A(i,j)', then the matrix is not
+modified.  If an error occurs, \verb'GrB_error(&err,A)' returns details about
+the error.
 
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GrB\_Matrix\_extractTuples:} get all entries from a matrix}
@@ -5661,7 +5122,6 @@ \subsubsection{{\sf GrB\_Matrix\_extractTuples:} get all entries from a matrix}
 \verb'NULL'.  This is not an error condition.
 \end{alert}
 
-\newpage
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GrB\_Matrix\_resize:} resize a matrix}
 %-------------------------------------------------------------------------------
@@ -5678,11 +5138,93 @@ \subsubsection{{\sf GrB\_Matrix\_resize:} resize a matrix}
 ) ;
 \end{verbatim} } \end{mdframed}
 
-\verb'GrB_Matrix_resize' changes the size of a matrix.
-If the dimensions decrease, entries that fall outside the resized
-matrix are deleted.
+\verb'GrB_Matrix_resize' changes the size of a matrix.  If the dimensions
+decrease, entries that fall outside the resized matrix are deleted.  Unlike
+\verb'GxB_Matrix_reshape*' (see Sections \ref{matrix_reshape} and
+\ref{matrix_reshapedup}), entries remain in their same position after resizing
+the matrix.
+
+%-------------------------------------------------------------------------------
+\newpage
+\subsubsection{{\sf GxB\_Matrix\_reshape:} reshape a matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_reshape}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_reshape     // reshape a GrB_Matrix in place
+(
+    // input/output:
+    GrB_Matrix C,               // input/output matrix, reshaped in place
+    // input:
+    bool by_col,                // true if reshape by column, false if by row
+    GrB_Index nrows_new,        // new number of rows of C
+    GrB_Index ncols_new,        // new number of columns of C
+    const GrB_Descriptor desc   // to control # of threads used
+) ;
+\end{verbatim} } \end{mdframed}
+
+\verb'GxB_Matrix_reshape' changes the size of a matrix \verb'C', taking entries
+from the input matrix either column-wise or row-wise.  If matrix \verb'C' on
+input is \verb'nrows'-by-\verb'ncols', and the requested dimensions of
+\verb'C' on output are \verb'nrows_new'-by-\verb'ncols_new', then
+the condition \verb'nrows*ncols == nrows_new*ncols_new' must hold.
+The matrix \verb'C' is modified in-place, as both an input and output for
+this method.  To create a new matrix, use \verb'GxB_Matrix_reshapeDup'
+instead (Section \ref{matrix_reshapedup}).
+
+For example, if \verb'C' is 3-by-4 on input, and is reshaped column-wise to
+have dimensions 2-by-6:
+
+\begin{verbatim}
+    C on input      C on output (by_col true)
+    00 01 02 03     00 20 11 02 22 13
+    10 11 12 13     10 01 21 12 03 23
+    20 21 22 23
+\end{verbatim}
+
+If the same \verb'C' on input is reshaped row-wise to dimensions 2-by-6:
+
+\begin{verbatim}
+    C on input      C on output (by_col false)
+    00 01 02 03     00 01 02 03 10 11
+    10 11 12 13     12 13 20 21 22 23
+    20 21 22 23
+\end{verbatim}
+
+NOTE: because an intermediate linear index must be computed for each entry,
+\verb'GxB_Matrix_reshape' cannot be used on matrices for which
+\verb'nrows*ncols' exceeds $2^{60}$.
+
+%-------------------------------------------------------------------------------
+\newpage
+\subsubsection{{\sf GxB\_Matrix\_reshapeDup:} reshape a matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_reshapedup}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_reshapeDup  // reshape a GrB_Matrix into another GrB_Matrix
+(
+    // output:
+    GrB_Matrix *C,              // newly created output matrix, not in place
+    // input:
+    GrB_Matrix A,               // input matrix, not modified
+    bool by_col,                // true if reshape by column, false if by row
+    GrB_Index nrows_new,        // number of rows of C
+    GrB_Index ncols_new,        // number of columns of C
+    const GrB_Descriptor desc   // to control # of threads used
+) ;
+\end{verbatim} } \end{mdframed}
+
+\verb'GxB_Matrix_reshapeDup' is identical to \verb'GxB_Matrix_reshape' (see
+Section \ref{matrix_reshape}), except that it creates a new output matrix
+\verb'C' that is reshaped from the input matrix \verb'A'.
 
 %-------------------------------------------------------------------------------
+% \newpage
 \subsubsection{{\sf GxB\_Matrix\_concat:} concatenate matrices }
 %-------------------------------------------------------------------------------
 \label{matrix_concat}
@@ -5734,8 +5276,8 @@ \subsubsection{{\sf GxB\_Matrix\_concat:} concatenate matrices }
 \verb'GxB_Matrix_Option_set' (format by row or by column, bitmap switch,
 hyper switch, and sparsity control) are unchanged.
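+
+As a usage sketch (not from the original text; the four tiles \verb'A00',
+\verb'A01', \verb'A10', and \verb'A11' are assumed to be existing
+\verb'n'-by-\verb'n' matrices of type \verb'GrB_FP64'), concatenating a
+2-by-2 grid of tiles into one 2\verb'n'-by-2\verb'n' matrix:
+
+    {\footnotesize
+    \begin{verbatim}
+    // Tiles are listed in row-major order: [ A00 A01 ; A10 A11 ]
+    GrB_Matrix Tiles [4] = { A00, A01, A10, A11 } ;
+    GrB_Matrix C ;
+    GrB_Matrix_new (&C, GrB_FP64, 2*n, 2*n) ;
+    GxB_Matrix_concat (C, Tiles, 2, 2, NULL) ;
+    \end{verbatim}}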
-\newpage
 %-------------------------------------------------------------------------------
+% \newpage
 \subsubsection{{\sf GxB\_Matrix\_split:} split a matrix }
 %-------------------------------------------------------------------------------
 \label{matrix_split}
@@ -5797,7 +5339,7 @@ \subsubsection{{\sf GrB\_Matrix\_diag:} construct a diagonal matrix}
 The output matrix \verb'C' is a newly-constructed square matrix with the same
 type as the input vector \verb'v'.  No typecasting is performed.
 
-\newpage
+% \newpage
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GxB\_Matrix\_diag:} build a diagonal matrix}
 %-------------------------------------------------------------------------------
@@ -5862,6 +5404,7 @@ \subsubsection{{\sf GxB\_Matrix\_memoryUsage:} memory used by a matrix}
 Returns the memory space required for a matrix, in bytes.
 
 %-------------------------------------------------------------------------------
+% \newpage
 \subsubsection{{\sf GrB\_Matrix\_free:} free a matrix}
 %-------------------------------------------------------------------------------
 \label{matrix_free}
@@ -5909,9 +5452,10 @@ \subsection{Serialize/deserialize methods}
 allocated by the user application, and it must be large enough to hold the
 matrix or vector.
 
-By default, LZ4 compression is used for serialization, but other options can be
-selected via the descriptor: \verb'GxB_set (desc, GxB_COMPRESSION, method)',
-where \verb'method' is an integer selected from the following options:
+By default, ZSTD (level 1) compression is used for serialization, but other
+options can be selected via the descriptor:
+\verb'GxB_set (desc, GxB_COMPRESSION, method)', where \verb'method' is an
+integer selected from the following options:
 
 \vspace{0.2in}
 {\footnotesize
@@ -5920,9 +5464,10 @@ \subsection{Serialize/deserialize methods}
 method & description \\
 \hline
 \verb'GxB_COMPRESSION_NONE'     & no compression \\
-\verb'GxB_COMPRESSION_DEFAULT'  & LZ4 \\
+\verb'GxB_COMPRESSION_DEFAULT'  & ZSTD, with default level 1 \\
 \verb'GxB_COMPRESSION_LZ4'      & LZ4 \\
 \verb'GxB_COMPRESSION_LZ4HC'    & LZ4HC, with default level 9 \\
+\verb'GxB_COMPRESSION_ZSTD'     & ZSTD, with default level 1 \\
 \hline
 \end{tabular} }
 \vspace{0.2in}
@@ -5936,6 +5481,13 @@ \subsection{Serialize/deserialize methods}
     \begin{verbatim}
     GxB_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_LZ4HC + 6) ; \end{verbatim}}
+The ZSTD method can be specified as level 1 to 19, with 1 being the default.
+To compress with ZSTD at level 6, use:
+
+    {\footnotesize
+    \begin{verbatim}
+    GxB_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_ZSTD + 6) ; \end{verbatim}}
+
 Deserialization of untrusted data is a common security problem; see
 \url{https://cwe.mitre.org/data/definitions/502.html}.  The deserialization
 methods do a few basic checks so that no out-of-bounds access occurs during
@@ -6029,7 +5581,7 @@ \subsection{Serialize/deserialize methods}
 % On output, it is reduced to the number of bytes actually used to serialize
 % the vector.  After calling \verb'GrB_Vector_serialize', the blob may be
 % \verb'realloc''d to this revised size if desired (this is optional).
-% LZ4 compression is used to construct a compact blob.
+% ZSTD (level 1) compression is used to construct a compact blob.
%-------------------------------------------------------------------------------
 \subsubsection{{\sf GxB\_Vector\_serialize:} serialize a vector}
@@ -6054,10 +5606,9 @@ \subsubsection{{\sf GxB\_Vector\_serialize:} serialize a vector}
 
 \verb'GxB_Vector_serialize' serializes a vector into a single array of bytes
 (the blob), which is \verb'malloc''ed and filled with the serialized vector.
-By default, LZ4 compression is used, but other options can be selected
-via the descriptor.
-Serializing a vector is identical to serializing a matrix;
-see Section \ref{matrix_serialize_GxB} for more information.
+By default, ZSTD (level 1) compression is used, but other options can be
+selected via the descriptor.  Serializing a vector is identical to serializing
+a matrix; see Section \ref{matrix_serialize_GxB} for more information.
 
 \newpage
 %-------------------------------------------------------------------------------
@@ -6115,6 +5666,12 @@ \subsubsection{{\sf GxB\_Vector\_deserialize:} deserialize a vector}
 Deserializing a vector is identical to deserializing a matrix;
 see Section \ref{matrix_deserialize_GxB} for more information.
 
+The blob is allocated with the \verb'malloc' function passed to
+\verb'GxB_init', or the ANSI C11 \verb'malloc' if \verb'GrB_init' was used
+to initialize GraphBLAS.  The blob must be freed by the matching \verb'free'
+method, either the \verb'free' function passed to \verb'GxB_init' or
+the ANSI C11 \verb'free' if \verb'GrB_init' was used.
+
 % Identical to \verb'GrB_Vector_deserialize', except that the descriptor
 % appears as the last parameter to control the number of threads used.
 
@@ -6171,7 +5728,7 @@ \subsubsection{{\sf GrB\_Matrix\_serialize:} serialize a matrix}
 On output, it is reduced to the number of bytes actually used to serialize
 the matrix.  After calling \verb'GrB_Matrix_serialize', the blob may be
 \verb'realloc''d to this revised size if desired (this is optional).
-LZ4 compression is used to construct a compact blob.
+ZSTD (level 1) compression is used to construct a compact blob.
 
 %-------------------------------------------------------------------------------
 \subsubsection{{\sf GxB\_Matrix\_serialize:} serialize a matrix}
@@ -6195,10 +5752,15 @@ \subsubsection{{\sf GxB\_Matrix\_serialize:} serialize a matrix}
 } \end{mdframed}
 
 \verb'GxB_Matrix_serialize' is identical to \verb'GrB_Matrix_serialize', except
-that it does not require a pre-allocated blob.  Instead, it \verb'malloc''s the
-blob internally, and fills it with the serialized matrix.
-By default, LZ4 compression is used, but other options can be selected
-via the descriptor.
+that it does not require a pre-allocated blob.  Instead, it allocates the blob
+internally, and fills it with the serialized matrix.  By default, ZSTD (level 1)
+compression is used, but other options can be selected via the descriptor.
+
+The blob is allocated with the \verb'malloc' function passed to
+\verb'GxB_init', or the ANSI C11 \verb'malloc' if \verb'GrB_init' was used
+to initialize GraphBLAS.  The blob must be freed by the matching \verb'free'
+method, either the \verb'free' function passed to \verb'GxB_init' or
+the ANSI C11 \verb'free' if \verb'GrB_init' was used.
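+
+A minimal round-trip sketch (not from the original text; it assumes an
+existing \verb'GrB_FP64' matrix \verb'A', and that \verb'GrB_init' was used,
+so the blob is freed with the ANSI C \verb'free'):
+
+    {\footnotesize
+    \begin{verbatim}
+    void *blob = NULL ;
+    GrB_Index blob_size = 0 ;
+    GrB_Descriptor desc ;
+    GrB_Descriptor_new (&desc) ;
+    GxB_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_ZSTD + 6) ; // level 6
+    GxB_Matrix_serialize (&blob, &blob_size, A, desc) ;
+    GrB_Matrix B ;
+    GxB_Matrix_deserialize (&B, GrB_FP64, blob, blob_size, NULL) ;
+    free (blob) ;           // blob was malloc'ed by GxB_Matrix_serialize
+    GrB_Descriptor_free (&desc) ;
+    \end{verbatim}}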
\newpage
 %-------------------------------------------------------------------------------
@@ -7371,8397 +6933,9032 @@ \subsubsection{{\sf GxB\_Matrix\_pack\_BitmapR:} pack a BitmapR matrix}
 \newpage
 %-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_unpack\_BitmapR:} unpack a BitmapR matrix}
+\subsubsection{{\sf GxB\_Matrix\_unpack\_BitmapR:} unpack a BitmapR matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_unpack_bitmapr}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_unpack_BitmapR  // unpack a bitmap matrix, by row
+(
+    GrB_Matrix A,       // matrix to unpack (type, nrows, ncols unchanged)
+    int8_t **Ab,        // bitmap
+    void **Ax,          // values
+    GrB_Index *Ab_size, // size of Ab in bytes
+    GrB_Index *Ax_size, // size of Ax in bytes
+    bool *iso,          // if true, A is iso
+    GrB_Index *nvals,   // # of entries in bitmap
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_unpack_BitmapR' unpacks a matrix in BitmapR form.
+If successful, the \verb'GrB_Matrix A' is returned with no entries.
+The number of entries is in \verb'nvals'.
+The BitmapR format is two arrays, \verb'Ab' and \verb'Ax'.  After an
+unpack, the user application is responsible for freeing these
+arrays via \verb'free' (or the \verb'free' function passed to \verb'GxB_init').
+The BitmapR format is described in Section~\ref{matrix_pack_bitmapr}.
+If \verb'Ab[p]' is zero, the value of \verb'Ax[p]' is undefined.
+This method takes $O(1)$ time if the matrix is already in BitmapR format.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_pack\_BitmapC:} pack a BitmapC matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_pack_bitmapc}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_pack_BitmapC  // pack a bitmap matrix, held by column
+(
+    GrB_Matrix A,       // matrix to create (type, nrows, ncols unchanged)
+    int8_t **Ab,        // bitmap, Ab_size >= nrows*ncols
+    void **Ax,          // values, Ax_size >= nrows*ncols * (type size)
+                        // or Ax_size >= (type size), if iso is true
+    GrB_Index Ab_size,  // size of Ab in bytes
+    GrB_Index Ax_size,  // size of Ax in bytes
+    bool iso,           // if true, A is iso
+    GrB_Index nvals,    // # of entries in bitmap
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_pack_BitmapC' packs a matrix from two user arrays in BitmapC
+format.  It is identical to \verb'GxB_Matrix_pack_BitmapR', except that the
+entry $A(i,j)$ is held in \verb'Ab[i+j*nrows]' and \verb'Ax[i+j*nrows]',
+in column-major format.
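+
+As an illustrative sketch (assuming \verb'Ab', \verb'Ax', \verb'iso',
+\verb'nrows', and \verb'ncols' describe a \verb'GrB_FP64' matrix held in
+BitmapC form), the entries can be traversed as follows:
+
+    {\footnotesize
+    \begin{verbatim}
+    for (int64_t j = 0 ; j < ncols ; j++)
+    {
+        for (int64_t i = 0 ; i < nrows ; i++)
+        {
+            int64_t p = i + j*nrows ;
+            if (Ab [p])
+            {
+                // A(i,j) is present in the bitmap
+                double aij = Ax [iso ? 0 : p] ;
+            }
+        }
+    } \end{verbatim}}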
+
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_unpack\_BitmapC:} unpack a BitmapC matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_unpack_bitmapc}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_unpack_BitmapC  // unpack a bitmap matrix, by col
+(
+    GrB_Matrix A,       // matrix to unpack (type, nrows, ncols unchanged)
+    int8_t **Ab,        // bitmap
+    void **Ax,          // values
+    GrB_Index *Ab_size, // size of Ab in bytes
+    GrB_Index *Ax_size, // size of Ax in bytes
+    bool *iso,          // if true, A is iso
+    GrB_Index *nvals,   // # of entries in bitmap
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_unpack_BitmapC' unpacks a matrix in BitmapC form.
+It is identical to \verb'GxB_Matrix_unpack_BitmapR', except that the
+entry $A(i,j)$ is held in \verb'Ab[i+j*nrows]' and \verb'Ax[i+j*nrows]',
+in column-major format.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_pack\_FullR:} pack a FullR matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_pack_fullr}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_pack_FullR  // pack a full matrix, held by row
+(
+    GrB_Matrix A,       // matrix to create (type, nrows, ncols unchanged)
+    void **Ax,          // values, Ax_size >= nrows*ncols * (type size)
+                        // or Ax_size >= (type size), if iso is true
+    GrB_Index Ax_size,  // size of Ax in bytes
+    bool iso,           // if true, A is iso
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_pack_FullR' packs a matrix from a user array in FullR format.
+For the \verb'FullR' format, the value of $A(i,j)$ is \verb'Ax[i*ncols+j]'.  To
+iterate over the rows and entries of this matrix, the following code can be
+used (assuming it has type \verb'GrB_FP64').  If \verb'A' is both full and iso,
+it takes $O(1)$ memory, regardless of \verb'nrows' and \verb'ncols'.
+
+    \vspace{-0.1in}
+    {\footnotesize
+    \begin{verbatim}
+    for (int64_t i = 0 ; i < nrows ; i++)
+    {
+        for (int64_t j = 0 ; j < ncols ; j++)
+        {
+            int64_t p = i*ncols + j ;
+            double aij = Ax [iso ? 0 : p] ;     // numerical value of A(i,j)
+        }
+    } \end{verbatim}}
+
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_unpack\_FullR:} unpack a FullR matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_unpack_fullr}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_unpack_FullR  // unpack a full matrix, by row
+(
+    GrB_Matrix A,       // matrix to unpack (type, nrows, ncols unchanged)
+    void **Ax,          // values
+    GrB_Index *Ax_size, // size of Ax in bytes
+    bool *iso,          // if true, A is iso
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_unpack_FullR' unpacks a matrix in FullR form.  It is identical
+to \verb'GxB_Matrix_unpack_BitmapR', except that all entries must be present.
+Prior to unpack, \verb'GrB_Matrix_nvals (&nvals, A)' must return
+\verb'nvals' equal to \verb'nrows*ncols'.  Otherwise, if \verb'A' is
+unpacked with \newline \verb'GxB_Matrix_unpack_FullR', an error is returned
+(\verb'GrB_INVALID_VALUE') and the matrix is not unpacked.
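+
+A round-trip sketch of the move semantics (not from the original text;
+\verb'GrB_init' is assumed, so the array is allocated with the ANSI C
+\verb'malloc'): ownership of the dense array moves into the matrix on pack,
+and back to the caller on unpack.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index nrows = 4, ncols = 8 ;
+    double *Ax = malloc (nrows * ncols * sizeof (double)) ;
+    for (int64_t p = 0 ; p < 32 ; p++) Ax [p] = (double) p ;
+    GrB_Matrix A ;
+    GrB_Matrix_new (&A, GrB_FP64, nrows, ncols) ;
+    // pack: A takes ownership of Ax, and Ax is set to NULL
+    GxB_Matrix_pack_FullR (A, (void **) &Ax,
+        nrows * ncols * sizeof (double), false, NULL) ;
+    // unpack: ownership moves back; the caller must free Ax
+    GrB_Index Ax_size ;
+    bool iso ;
+    GxB_Matrix_unpack_FullR (A, (void **) &Ax, &Ax_size, &iso, NULL) ;
+    free (Ax) ;
+    \end{verbatim}}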
+
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_pack\_FullC:} pack a FullC matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_pack_fullc}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_pack_FullC  // pack a full matrix, held by column
+(
+    GrB_Matrix A,       // matrix to create (type, nrows, ncols unchanged)
+    void **Ax,          // values, Ax_size >= nrows*ncols * (type size)
+                        // or Ax_size >= (type size), if iso is true
+    GrB_Index Ax_size,  // size of Ax in bytes
+    bool iso,           // if true, A is iso
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_pack_FullC' packs a matrix from a user array in FullC
+format.  For the \verb'FullC' format,
+the value of $A(i,j)$ is \verb'Ax[i+j*nrows]'.
+To iterate over the rows and entries of this matrix, the following code can be
+used (assuming it has type \verb'GrB_FP64').
+If \verb'A' is both full and iso, it takes $O(1)$ memory,
+regardless of \verb'nrows' and \verb'ncols'.
+
+    \vspace{-0.1in}
+    {\footnotesize
+    \begin{verbatim}
+    for (int64_t i = 0 ; i < nrows ; i++)
+    {
+        for (int64_t j = 0 ; j < ncols ; j++)
+        {
+            int64_t p = i + j*nrows ;
+            double aij = Ax [iso ? 0 : p] ;     // numerical value of A(i,j)
+        }
+    } \end{verbatim}}
+
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_unpack\_FullC:} unpack a FullC matrix}
+%-------------------------------------------------------------------------------
+\label{matrix_unpack_fullc}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_Matrix_unpack_FullC  // unpack a full matrix, by column
+(
+    GrB_Matrix A,       // matrix to unpack (type, nrows, ncols unchanged)
+    void **Ax,          // values
+    GrB_Index *Ax_size, // size of Ax in bytes
+    bool *iso,          // if true, A is iso
+    const GrB_Descriptor desc
+) ;
+\end{verbatim}
+} \end{mdframed}
+
+\verb'GxB_Matrix_unpack_FullC' unpacks a matrix in FullC form.  It is identical
+to \verb'GxB_Matrix_unpack_BitmapC', except that all entries must be present.
+That is, prior to unpack, \verb'GrB_Matrix_nvals (&nvals, A)' must return
+\verb'nvals' equal to \verb'nrows*ncols'.  Otherwise, if \verb'A' is
+unpacked with \newline \verb'GxB_Matrix_unpack_FullC', an error is returned
+(\verb'GrB_INVALID_VALUE') and the matrix is not unpacked.
+
+\newpage
+%===============================================================================
+\subsection{GraphBLAS import/export: using copy semantics} %====================
+%===============================================================================
+\label{GrB_import_export}
+
+The v2.0 C API includes import/export methods for matrices (not vectors) using
+a different strategy than the \verb'GxB*pack/unpack*' methods.  The
+\verb'GxB' methods are based on {\em move semantics}, in which ownership of
+arrays is passed between SuiteSparse:GraphBLAS and the user application.  This
+allows the \verb'GxB*pack/unpack*' methods to work in $O(1)$ time, and require
+no additional memory, but it requires that GraphBLAS and the user application
+agree on which memory manager to use.  This is done via \verb'GxB_init'.  This
+allows GraphBLAS to \verb'malloc' an array that can be later \verb'free'd by
+the user application, and vice versa.
+
+The \verb'GrB' import/export methods take a different approach.
The data
+is always copied in and out between the opaque GraphBLAS matrix and the
+user arrays.  This takes $\Omega(e)$ time, if the matrix has $e$ entries,
+and requires more memory.  It has the advantage that it does not require
+GraphBLAS and the user application to agree on what memory manager to use,
+since no ownership of allocated arrays is changed.
+
+The format for \verb'GrB_Matrix_import' and \verb'GrB_Matrix_export' is
+controlled by the following enum:
+
+{\footnotesize
+\begin{verbatim}
+typedef enum
+{
+    GrB_CSR_FORMAT = 0,     // CSR format (equiv to GxB_SPARSE with GxB_BY_ROW)
+    GrB_CSC_FORMAT = 1,     // CSC format (equiv to GxB_SPARSE with GxB_BY_COL)
+    GrB_COO_FORMAT = 2      // triplet format (like input to GrB*build)
+}
+GrB_Format ; \end{verbatim}}
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Matrix\_import:} import a matrix}
 %-------------------------------------------------------------------------------
-\label{matrix_unpack_bitmapr}
+\label{GrB_matrix_import}
 
 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GxB_Matrix_unpack_BitmapR  // unpack a bitmap matrix, by row
+GrB_Info GrB_Matrix_import      // import a matrix
 (
-    GrB_Matrix A,       // matrix to unpack (type, nrows, ncols unchanged)
-    int8_t **Ab,        // bitmap
-    void **Ax,          // values
-    GrB_Index *Ab_size, // size of Ab in bytes
-    GrB_Index *Ax_size, // size of Ax in bytes
-    bool *iso,          // if true, A is iso
-    GrB_Index *nvals,   // # of entries in bitmap
-    const GrB_Descriptor desc
+    GrB_Matrix *A,          // handle of matrix to create
+    GrB_Type type,          // type of matrix to create
+    GrB_Index nrows,        // number of rows of the matrix
+    GrB_Index ncols,        // number of columns of the matrix
+    const GrB_Index *Ap,    // pointers for CSR, CSC, column indices for COO
+    const GrB_Index *Ai,    // row indices for CSR, CSC
+    const <type> *Ax,       // values
+    GrB_Index Ap_len,       // number of entries in Ap (not # of bytes)
+    GrB_Index Ai_len,       // number of entries in Ai (not # of bytes)
+    GrB_Index Ax_len,       // number of entries in Ax (not # of bytes)
+    GrB_Format format       // import format
 ) ;
 \end{verbatim} } \end{mdframed}
 
-\verb'GxB_Matrix_unpack_BitmapR' unpacks a matrix in BitmapR form.
-If successful, the \verb'GrB_Matrix A' is returned with no entries.
-The number of entries is in \verb'nvals'.
-The BitmapR format is two arrays \verb'Ab', and \verb'Ax'.  After an
-unpack, the user application is responsible for freeing these
-arrays via \verb'free' (or the \verb'free' function passed to \verb'GxB_init').
-The BitmapR format is described in Section~\ref{matrix_pack_bitmapr}.
-If \verb'Ab[p]' is zero, the value of \verb'Ax[p]' is undefined.
-This method takes $O(1)$ time if the matrix is already in BitmapR format.
+The \verb'GrB_Matrix_import' method copies from user-provided arrays into an
+opaque \verb'GrB_Matrix' and \verb'GrB_Matrix_export' copies data out, from an
+opaque \verb'GrB_Matrix' into user-provided arrays.
 
-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_pack\_BitmapC:} pack a BitmapC matrix}
-%-------------------------------------------------------------------------------
-\label{matrix_pack_bitmapc}
+The suffix \verb'TYPE' in the prototype above is one of \verb'BOOL',
+\verb'INT8', \verb'INT16', etc., for built-in types, or \verb'UDT' for
+user-defined types.  The type of the \verb'Ax' array must match this type.  No
+typecasting is performed.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Matrix_pack_BitmapC  // pack a bitmap matrix, held by column
-(
-    GrB_Matrix A,       // matrix to create (type, nrows, ncols unchanged)
-    int8_t **Ab,        // bitmap, Ab_size >= nrows*ncols
-    void **Ax,          // values, Ax_size >= nrows*ncols * (type size)
-                        // or Ax_size >= (type size), if iso is true
-    GrB_Index Ab_size,  // size of Ab in bytes
-    GrB_Index Ax_size,  // size of Ax in bytes
-    bool iso,           // if true, A is iso
-    GrB_Index nvals,    // # of entries in bitmap
-    const GrB_Descriptor desc
-) ;
-\end{verbatim}
-} \end{mdframed}
+Unlike the \verb'GxB'
+pack/unpack methods, memory is not handed off between the user application
+and GraphBLAS.  The three arrays \verb'Ap', \verb'Ai', and \verb'Ax' are not
+modified, and are still owned by the user application when the method finishes.
 
-\verb'GxB_Matrix_pack_BitmapC' packs a matrix from 2 user arrays in BitmapC
-format.  It is identical to \verb'GxB_Matrix_pack_BitmapR', except that the
-entry $A(i,j)$ is held in \verb'Ab[i+j*nrows]' and \verb'Ax[i+j*nrows]',
-in column-major format.
+The matrix can be imported in one of three different formats:
+
+\begin{packed_itemize}
+    \item \verb'GrB_CSR_FORMAT':  % CSR format (equiv to GxB_SPARSE with GxB_BY_ROW)
+    Compressed-row format.  \verb'Ap' is an array of size \verb'nrows+1'.
+    The arrays \verb'Ai' and \verb'Ax' are of size \verb'nvals = Ap [nrows]',
+    and \verb'Ap[0]' must be zero.
+    The column indices of entries in the \verb'i'th row appear in
+    \verb'Ai[Ap[i]...Ap[i+1]-1]', and the values of those entries appear in
+    the same locations in \verb'Ax'.
+    The column indices need not be in any particular order.
+
+    \item \verb'GrB_CSC_FORMAT':  % CSC format (equiv to GxB_SPARSE with GxB_BY_COL)
+    Compressed-column format.  \verb'Ap' is an array of size \verb'ncols+1'.
+    The arrays \verb'Ai' and \verb'Ax' are of size \verb'nvals = Ap [ncols]',
+    and \verb'Ap[0]' must be zero.
+    The row indices of entries in the \verb'j'th column appear in
+    \verb'Ai[Ap[j]...Ap[j+1]-1]', and the values of those entries appear in
+    the same locations in \verb'Ax'.
+    The row indices need not be in any particular order.
+
+    \item \verb'GrB_COO_FORMAT':  % triplet format (like input to GrB*build)
+    Coordinate format.  This is the same format as \newline
+    \verb'GrB_Matrix_build'.
+    The three arrays \verb'Ap', \verb'Ai', and \verb'Ax' have the same
+    size.  The \verb'k'th tuple has row index \verb'Ai[k]',
+    column index \verb'Ap[k]', and value \verb'Ax[k]'.  The tuples can
+    appear in any order, but no duplicates are permitted.
+
+%   \item \verb'GrB_DENSE_ROW_FORMAT': % FullR format (GxB_FULL with GxB_BY_ROW)
+%   Dense matrix format, held by row.  Only the \verb'Ax' array is used, of
+%   size \verb'nrows*ncols'.
+%   It holds the matrix in dense format, in row major order.
+%
+%   \item \verb'GrB_DENSE_COL_FORMAT': % FullC format (GxB_FULL with GxB_BY_COL)
+%   Dense matrix format, held by column.  Only the \verb'Ax' array is used, of
+%   size \verb'nrows*ncols'.
+%   It holds the matrix in dense format, in column major order.
+
+\end{packed_itemize}

%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_unpack\_BitmapC:} unpack a BitmapC matrix}
+\subsubsection{{\sf GrB\_Matrix\_export:} export a matrix}
%-------------------------------------------------------------------------------
-\label{matrix_unpack_bitmapc}
+\label{GrB_matrix_export}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_Matrix_unpack_BitmapC  // unpack a bitmap matrix, by col
+GrB_Info GrB_Matrix_export  // export a matrix
(
-    GrB_Matrix A,        // matrix to unpack (type, nrows, ncols unchanged)
-    int8_t **Ab,         // bitmap
-    void **Ax,           // values
-    GrB_Index *Ab_size,  // size of Ab in bytes
-    GrB_Index *Ax_size,  // size of Ax in bytes
-    bool *iso,           // if true, A is iso
-    GrB_Index *nvals,    // # of entries in bitmap
-    const GrB_Descriptor desc
+    GrB_Index *Ap,       // pointers for CSR, CSC, column indices for COO
+    GrB_Index *Ai,       // col indices for CSR, row indices for CSC/COO
+    <type> *Ax,          // values (must match the type of A)
+    GrB_Index *Ap_len,   // number of entries in Ap (not # of bytes)
+    GrB_Index *Ai_len,   // number of entries in Ai (not # of bytes)
+    GrB_Index *Ax_len,   // number of entries in Ax (not # of bytes)
+    GrB_Format format,   // export format
+    GrB_Matrix A         // matrix to export
) ;
\end{verbatim}
} \end{mdframed}

-\verb'GxB_Matrix_unpack_BitmapC' unpacks a matrix in BitmapC form.
-It is identical to \verb'GxB_Matrix_unpack_BitmapR', except that the
-entry $A(i,j)$ is held in \verb'Ab[i+j*nrows]' and \verb'Ax[i+j*nrows]',
-in column-major format.
+\verb'GrB_Matrix_export' copies the contents of a matrix into three
+user-provided arrays, using any one of the three different formats
+described in Section~\ref{GrB_matrix_import}. The size of the arrays must be
+at least as large as the lengths returned by \verb'GrB_Matrix_exportSize'. The
+matrix \verb'A' is not modified.
+
+On input, the size of the three arrays \verb'Ap', \verb'Ai', and \verb'Ax' is
+given by \verb'Ap_len', \verb'Ai_len', and \verb'Ax_len', respectively. These
+values are in terms of the number of entries in these arrays, not the number of
+bytes. On output, these three values are adjusted to report the number of
+entries written to the three arrays.
+
+The suffix \verb'TYPE' in the prototype above is one of \verb'BOOL',
+\verb'INT8', \verb'INT16', etc, for built-in types, or \verb'UDT' for
+user-defined types. The type of the \verb'Ax' array must match this type. No
+typecasting is performed.
+
+% The \verb'GrB_DENSE_ROW_FORMAT' and \verb'GrB_DENSE_COL_FORMAT' formats can
+% only be used if all entries are present in the matrix. That is,
+% \verb'GrB_Matrix_nvals (&nvals,A)' must return \verb'nvals' equal to
+% \verb'nrows*ncols'.
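+
+As an illustration, the following sketch (not part of the specification;
+error checking, \verb'#include's, and \verb'free's are omitted, and the
+variable names are placeholders) imports a \verb'GrB_FP64' matrix in CSR
+form, then copies it back out with \verb'GrB_Matrix_exportSize' and
+\verb'GrB_Matrix_export':
+
+{\footnotesize
+\begin{verbatim}
+    // import: Ap, Ai, Ax remain owned by the user application
+    GrB_Matrix A ;
+    GrB_Matrix_import_FP64 (&A, GrB_FP64, nrows, ncols,
+        Ap, Ai, Ax, nrows+1, nvals, nvals, GrB_CSR_FORMAT) ;
+    // export: query the required array sizes, allocate, then copy out
+    GrB_Index Ap_len, Ai_len, Ax_len ;
+    GrB_Matrix_exportSize (&Ap_len, &Ai_len, &Ax_len, GrB_CSR_FORMAT, A) ;
+    GrB_Index *Bp = malloc (Ap_len * sizeof (GrB_Index)) ;
+    GrB_Index *Bi = malloc (Ai_len * sizeof (GrB_Index)) ;
+    double    *Bx = malloc (Ax_len * sizeof (double)) ;
+    GrB_Matrix_export_FP64 (Bp, Bi, Bx, &Ap_len, &Ai_len, &Ax_len,
+        GrB_CSR_FORMAT, A) ;
+\end{verbatim}}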
\newpage %------------------------------------------------------------------------------- -\subsubsection{{\sf GxB\_Matrix\_pack\_FullR:} pack a FullR matrix} +\subsubsection{{\sf GrB\_Matrix\_exportSize:} determine size of export} %------------------------------------------------------------------------------- -\label{matrix_pack_fullr} +\label{export_size} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GxB_Matrix_pack_FullR // pack a full matrix, held by row +GrB_Info GrB_Matrix_exportSize // determine sizes of user arrays for export ( - GrB_Matrix A, // matrix to create (type, nrows, ncols unchanged) - void **Ax, // values, Ax_size >= nrows*ncols * (type size) - // or Ax_size >= (type size), if iso is true - GrB_Index Ax_size, // size of Ax in bytes - bool iso, // if true, A is iso - const GrB_Descriptor desc + GrB_Index *Ap_len, // # of entries required for Ap (not # of bytes) + GrB_Index *Ai_len, // # of entries required for Ai (not # of bytes) + GrB_Index *Ax_len, // # of entries required for Ax (not # of bytes) + GrB_Format format, // export format + GrB_Matrix A // matrix to export ) ; \end{verbatim} } \end{mdframed} -\verb'GxB_Matrix_pack_FullR' packs a matrix from a user array in FullR format. -For the \verb'FullR' format, t value of $A(i,j)$ is \verb'Ax[i*ncols+j]'. To -iterate over the rows and entries of this matrix, the following code can be -used (assuming it has type \verb'GrB_FP64'). If \verb'A' is both full and iso, -it takes $O(1)$ memory, regardless of \verb'nrows' and \verb'ncols'. - - \vspace{-0.1in} - {\footnotesize - \begin{verbatim} - for (int64_t i = 0 ; i < nrows ; i++) - { - for (int64_t j = 0 ; j < ncols ; j++) - { - int64_t p = i*ncols + j ; - double aij = Ax [iso ? 0 : p] ; // numerical value of A(i,j) - } - } \end{verbatim}} +Returns the required sizes of the arrays \verb'Ap', \verb'Ai', and \verb'Ax' +for exporting a matrix using \verb'GrB_Matrix_export', using the same +\verb'format'. %------------------------------------------------------------------------------- -\subsubsection{{\sf GxB\_Matrix\_unpack\_FullR:} unpack a FullR matrix} +\subsubsection{{\sf GrB\_Matrix\_exportHint:} determine best export format} %------------------------------------------------------------------------------- -\label{matrix_unpack_fullr} +\label{export_hint} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GxB_Matrix_unpack_FullR // unpack a full matrix, by row +GrB_Info GrB_Matrix_exportHint // suggest the best export format ( - GrB_Matrix A, // matrix to unpack (type, nrows, ncols unchanged) - void **Ax, // values - GrB_Index *Ax_size, // size of Ax in bytes - bool *iso, // if true, A is iso - const GrB_Descriptor desc + GrB_Format *format, // export format + GrB_Matrix A // matrix to export ) ; \end{verbatim} } \end{mdframed} -\verb'GxB_Matrix_unpack_FullR' unpacks a matrix in FullR form. It is identical -to \verb'GxB_Matrix_unpack_BitmapR', except that all entries must be present. -Prior to unpack, \verb'GrB_Matrix_nvals (&nvals, A)' must return -\verb'nvals' equal to \verb'nrows*ncols'. Otherwise, if the \verb'A' is -unpacked with \newline \verb'GxB_Matrix_unpack_FullR', an error is returned -(\verb'GrB_INVALID_VALUE') and the matrix is not unpacked. +This method suggests the most efficient format for the export of a given +matrix. 
For SuiteSparse:GraphBLAS, the hint depends on the current +format of the \verb'GrB_Matrix': + +\begin{packed_itemize} +\item \verb'GxB_SPARSE', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' +\item \verb'GxB_SPARSE', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' +\item \verb'GxB_HYPERSPARSE': export as \verb'GrB_COO_FORMAT' +\item \verb'GxB_BITMAP', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' +\item \verb'GxB_BITMAP', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' +%\item \verb'GxB_FULL', \verb'GxB_BY_ROW': export as \verb'GrB_DENSE_ROW_FORMAT' +%\item \verb'GxB_FULL', \verb'GxB_BY_COL': export as \verb'GrB_DENSE_COL_FORMAT' +\item \verb'GxB_FULL', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' +\item \verb'GxB_FULL', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' +\end{packed_itemize} + +\newpage +%=============================================================================== +\subsection{Sorting methods} +%=============================================================================== +\label{sorting_methods} + +\verb'GxB_Matrix_sort' provides a mechanism to sort all the rows or +all the columns of a matrix, and \verb'GxB_Vector_sort' sorts all the +entries in a vector. %------------------------------------------------------------------------------- -\subsubsection{{\sf GxB\_Matrix\_pack\_FullC:} pack a FullC matrix} +\subsubsection{{\sf GxB\_Vector\_sort:} sort a vector} %------------------------------------------------------------------------------- -\label{matrix_pack_fullc} +\label{vector_sort} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GxB_Matrix_pack_FullC // pack a full matrix, held by column +GrB_Info GxB_sort ( - GrB_Matrix A, // matrix to create (type, nrows, ncols unchanged) - void **Ax, // values, Ax_size >= nrows*ncols * (type size) - // or Ax_size >= (type size), if iso is true - GrB_Index Ax_size, // size of Ax in bytes - bool iso, // if true, A is iso + // output: + GrB_Vector w, // vector of sorted values + GrB_Vector p, // vector containing the permutation + // input + GrB_BinaryOp op, // comparator op + GrB_Vector u, // vector to sort const GrB_Descriptor desc ) ; \end{verbatim} } \end{mdframed} -\verb'GxB_Matrix_pack_FullC' packs a matrix from a user arrays in FullC -format. For the \verb'FullC' format, -the value of $A(i,j)$ is \verb'Ax[i+j*nrows]'. -To iterate over the rows and entries of this matrix, the following code can be -used (assuming it has type \verb'GrB_FP64'). -If \verb'A' is both full and iso, it takes $O(1)$ memory, -regardless of \verb'nrows' and \verb'ncols'. - - \vspace{-0.1in} - {\footnotesize - \begin{verbatim} - for (int64_t i = 0 ; i < nrows ; i++) - { - for (int64_t j = 0 ; j < ncols ; j++) - { - int64_t p = i + j*nrows ; - double aij = Ax [iso ? 0 : p] ; // numerical value of A(i,j) - } - } \end{verbatim}} +\verb'GxB_Vector_sort' is identical to sorting the single column of an +\verb'n'-by-1 matrix. The descriptor is ignored, except to control the number +of threads to use. Refer to Section \ref{matrix_sort} for details. 
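+
+For illustration, a minimal sketch (assuming a \verb'GrB_FP64' vector
+\verb'u' of length \verb'n' has already been created and filled; the
+variable names are not part of the API) that sorts \verb'u' in ascending
+order:
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector w, p ;
+    GrB_Vector_new (&w, GrB_FP64, n) ;    // sorted values
+    GrB_Vector_new (&p, GrB_INT64, n) ;   // permutation: w(i) = u(p(i))
+    // GrB_LT_FP64 is the comparator, so the sort is ascending
+    GxB_Vector_sort (w, p, GrB_LT_FP64, u, NULL) ;
+\end{verbatim}}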
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_unpack\_FullC:} unpack a FullC matrix}
+\subsubsection{{\sf GxB\_Matrix\_sort:} sort the rows/columns of a matrix}
%-------------------------------------------------------------------------------
-\label{matrix_unpack_fullc}
+\label{matrix_sort}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_Matrix_unpack_FullC  // unpack a full matrix, by column
+GrB_Info GxB_sort
(
-    GrB_Matrix A,        // matrix to unpack (type, nrows, ncols unchanged)
-    void **Ax,           // values
-    GrB_Index *Ax_size,  // size of Ax in bytes
-    bool *iso,           // if true, A is iso
+    // output:
+    GrB_Matrix C,        // matrix of sorted values
+    GrB_Matrix P,        // matrix containing the permutations
+    // input
+    GrB_BinaryOp op,     // comparator op
+    GrB_Matrix A,        // matrix to sort
    const GrB_Descriptor desc
) ;
\end{verbatim}
} \end{mdframed}

-\verb'GxB_Matrix_unpack_FullC' unpacks a matrix in FullC form.  It is identical
-to \verb'GxB_Matrix_unpack_BitmapC', except that all entries must be present.
-That is, prior to unpack, \verb'GrB_Matrix_nvals (&nvals, A)' must return
-\verb'nvals' equal to \verb'nrows*ncols'.  Otherwise, if the \verb'A' is
-unpacked with \newline \verb'GxB_Matrix_unpack_FullC', an error is returned
-(\verb'GrB_INVALID_VALUE') and the matrix is not unpacked.
+\verb'GxB_Matrix_sort' sorts all the rows or all the columns of a matrix.
+Each row (or column) is sorted separately. The rows are sorted by default.
+To sort the columns, use \verb'GrB_DESC_T0'. A comparator operator is
+provided to define the sorting order (ascending or descending).
+For example, to sort a \verb'GrB_FP64' matrix in ascending order,
+use \verb'GrB_LT_FP64' as the \verb'op', and to sort in descending order,
+use \verb'GrB_GT_FP64'.
+
+The \verb'op' must have a return value of \verb'GrB_BOOL', and the types of
+its two inputs must be the same. The entries in \verb'A' are typecasted to
+the inputs of the \verb'op', if necessary. Matrices with user-defined types
+can be sorted with a user-defined comparator operator, whose two input types
+must match the type of \verb'A', and whose output is \verb'GrB_BOOL'.
+
+The two matrix outputs are \verb'C' and \verb'P'. Any entries present on input
+in \verb'C' or \verb'P' are discarded on output. The type of \verb'C' must
+match the type of \verb'A' exactly. The dimensions of \verb'C', \verb'P', and
+\verb'A' must also match exactly (even with the \verb'GrB_DESC_T0'
+descriptor).
+
+With the default sort (by row), suppose \verb'A(i,:)' contains \verb'k'
+entries. In this case, \verb'C(i,0:k-1)' contains the values of those entries
+in sorted order, and \verb'P(i,0:k-1)' contains their corresponding column
+indices in the matrix \verb'A'. If two values are the same, ties are broken
+according to column index.
+
+If the matrix is sorted by column, and \verb'A(:,j)' contains \verb'k' entries,
+then \verb'C(0:k-1,j)' contains the values of those entries in sorted order,
+and \verb'P(0:k-1,j)' contains their corresponding row indices in the matrix
+\verb'A'. If two values are the same, ties are broken according to row index.
+
+The outputs \verb'C' and \verb'P' are both optional; either one (but not both)
+may be \verb'NULL', in which case that particular output matrix is not
+computed.
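+
+For example, the following sketch (variable names illustrative, not from the
+text above) sorts each column of a \verb'GrB_FP64' matrix \verb'A' of size
+\verb'm'-by-\verb'n' in descending order, discarding the permutation by
+passing \verb'NULL' for \verb'P':
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Matrix C ;
+    GrB_Matrix_new (&C, GrB_FP64, m, n) ;  // same type and dimensions as A
+    // GrB_DESC_T0 sorts by column; GrB_GT_FP64 gives descending order
+    GxB_Matrix_sort (C, NULL, GrB_GT_FP64, A, GrB_DESC_T0) ;
+\end{verbatim}}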
\newpage
%===============================================================================
-\subsection{GraphBLAS import/export: using copy semantics} %====================
+\subsection{GraphBLAS descriptors: {\sf GrB\_Descriptor}} %=====================
%===============================================================================
-\label{GrB_import_export}
-
-The v2.0 C API includes import/export methods for matrices (not vectors) using
-a different strategy as compared to the \verb'GxB*pack/unpack*' methods.  The
-\verb'GxB' methods are based on {\em move semantics}, in which ownership of
-arrays is passed between SuiteSparse:GraphBLAS and the user application.  This
-allows the \verb'GxB*pack/unpack*' methods to work in $O(1)$ time, and require
-no additional memory, but it requires that GraphBLAS and the user application
-agree on which memory manager to use.  This is done via \verb'GxB_init'.  This
-allows GraphBLAS to \verb'malloc' an array that can be later \verb'free'd by
-the user application, and visa versa.
+\label{descriptor}

-The \verb'GrB' import/export methods take a different approach.  The data
-is always copied in and out between the opaque GraphBLAS matrix and the
-user arrays.  This takes $\Omega(e)$ time, if the matrix has $e$ entries,
-and requires more memory.  It has the advantage that it does not require
-GraphBLAS and the user application to agree on what memory manager to use,
-since no ownership of allocated arrays is changed.
+A GraphBLAS {\em descriptor} modifies the behavior of a GraphBLAS operation.
+If the descriptor is \verb'GrB_NULL', defaults are used.

-The format for \verb'GrB_Matrix_import' and \verb'GrB_Matrix_export' is
-controlled by the following enum:
+The access to these parameters and their values is governed
+by two \verb'enum' types, \verb'GrB_Desc_Field' and \verb'GrB_Desc_Value':

+\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
+#define GxB_NTHREADS 5   // for both GrB_Desc_Field and GxB_Option_Field
+#define GxB_CHUNK 7
typedef enum
{
-    GrB_CSR_FORMAT = 0,  // CSR format (equiv to GxB_SPARSE with GxB_BY_ROW)
-    GrB_CSC_FORMAT = 1,  // CSC format (equiv to GxB_SPARSE with GxB_BY_COL)
-    GrB_COO_FORMAT = 2   // triplet format (like input to GrB*build)
+    GrB_OUTP = 0,   // descriptor for output of a method
+    GrB_MASK = 1,   // descriptor for the mask input of a method
+    GrB_INP0 = 2,   // descriptor for the first input of a method
+    GrB_INP1 = 3,   // descriptor for the second input of a method
+    GxB_DESCRIPTOR_NTHREADS = GxB_NTHREADS,  // number of threads to use
+    GxB_DESCRIPTOR_CHUNK = GxB_CHUNK,        // chunk size for small problems
+    GxB_AxB_METHOD = 1000,  // descriptor for selecting C=A*B algorithm
+    GxB_SORT = 35,          // control sort in GrB_mxm
+    GxB_COMPRESSION = 36,   // select compression for serialize
+    GxB_IMPORT = 37,        // secure vs fast pack
}
-GrB_Format ; \end{verbatim}}
+GrB_Desc_Field ;
+
+typedef enum
+{
+    // for all GrB_Descriptor fields:
+    GxB_DEFAULT = 0,    // default behavior of the method
+    // for GrB_OUTP only:
+    GrB_REPLACE = 1,    // clear the output before assigning new values to it
+    // for GrB_MASK only:
+    GrB_COMP = 2,       // use the complement of the mask
+    GrB_STRUCTURE = 4,  // use the structure of the mask
+    // for GrB_INP0 and GrB_INP1 only:
+    GrB_TRAN = 3,       // use the transpose of the input
+    // for GxB_AxB_METHOD only:
+    GxB_AxB_GUSTAVSON = 1001,  // gather-scatter saxpy method
+    GxB_AxB_DOT = 1003,        // dot product
+    GxB_AxB_HASH = 1004,       // hash-based saxpy method
+    GxB_AxB_SAXPY = 1005,      // saxpy method (any kind)
+    // for GxB_IMPORT only:
+    GxB_SECURE_IMPORT = 502    // GxB*_pack* methods trust their input data
+}
+GrB_Desc_Value ;
+\end{verbatim} } \end{mdframed}

\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_import:} import a matrix}
-%-------------------------------------------------------------------------------
-\label{GrB_matrix_import}
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_Matrix_import  // import a matrix
-(
-    GrB_Matrix *A,          // handle of matrix to create
-    GrB_Type type,          // type of matrix to create
-    GrB_Index nrows,        // number of rows of the matrix
-    GrB_Index ncols,        // number of columns of the matrix
-    const GrB_Index *Ap,    // pointers for CSR, CSC, column indices for COO
-    const GrB_Index *Ai,    // row indices for CSR, CSC
-    const *Ax,              // values
-    GrB_Index Ap_len,       // number of entries in Ap (not # of bytes)
-    GrB_Index Ai_len,       // number of entries in Ai (not # of bytes)
-    GrB_Index Ax_len,       // number of entries in Ax (not # of bytes)
-    GrB_Format format       // import format
-) ;
-\end{verbatim}
-} \end{mdframed}
+\begin{itemize}
+\item \verb'GrB_OUTP' is a parameter that modifies the output of a
+    GraphBLAS operation. In the default case, the output is not cleared, and
+    ${\bf Z = C \odot T}$ then ${\bf C \langle M \rangle = Z}$ are computed
+    as-is, where ${\bf T}$ is the result of the particular GraphBLAS
+    operation.

-The \verb'GrB_Matrix_import' method copies from user-provided arrays into an
-opaque \verb'GrB_Matrix' and \verb'GrB_Matrix_export' copies data out, from an
-opaque \verb'GrB_Matrix' into user-provided arrays.
+
+    In the non-default case, ${\bf Z = C \odot T}$ is first computed, using the
+    results of ${\bf T}$ and the accumulator $\odot$. After this is done, if
+    the \verb'GrB_OUTP' descriptor field is set to \verb'GrB_REPLACE', then the
+    output is cleared of its entries. Next, the assignment ${\bf C \langle M
+    \rangle = Z}$ is performed.
+
+\item \verb'GrB_MASK' is a parameter that modifies the \verb'Mask',
+    even if the mask is not present.
+
+    If this parameter is set to its default value, and if the mask is not
+    present (\verb'Mask==NULL') then implicitly \verb'Mask(i,j)=1' for all
+    \verb'i' and \verb'j'. If the mask is present then \verb'Mask(i,j)=1'
+    means that \verb'C(i,j)' is to be modified by the ${\bf C \langle M \rangle
+    = Z}$ update. Otherwise, if \verb'Mask(i,j)=0', then \verb'C(i,j)' is not
+    modified, even if \verb'Z(i,j)' is an entry with a different value; that
+    value is simply discarded.
+
+    If the \verb'GrB_MASK' parameter is set to \verb'GrB_COMP', then the
+    use of the mask is complemented.
+
+    If the \verb'GrB_MASK' parameter is set to \verb'GrB_STRUCTURE',
+    then the values of the mask are ignored, and just the pattern of the
+    entries is used. Any entry \verb'M(i,j)' in the pattern is treated as if
+    it were true.
+
+    The \verb'GrB_COMP' and \verb'GrB_STRUCTURE' settings can be combined,
+    either by setting the mask option twice (once with each value), or by
+    setting the mask option to \verb'GrB_COMP+GrB_STRUCTURE' (the latter is an
+    extension to the specification).
+
+    Using a parameter to complement the \verb'Mask' is very useful because
+    constructing the actual complement of a very sparse mask is impossible
+    since it has too many entries. If the number of places in \verb'C'
+    that should be modified is very small, then use a sparse mask without
+    complementing it. If the number of places in \verb'C' that should
+    be protected from modification is very small, then use a sparse mask
+    to indicate those places, and use a descriptor \verb'GrB_MASK' that
+    complements the use of the mask.
+
+\item \verb'GrB_INP0' and \verb'GrB_INP1' modify the use of the
+    first and second input matrices \verb'A' and \verb'B' of the GraphBLAS
+    operation.
+
+    If the \verb'GrB_INP0' is set to \verb'GrB_TRAN', then \verb'A' is
+    transposed before using it in the operation. Likewise, if
+    \verb'GrB_INP1' is set to \verb'GrB_TRAN', then the second input,
+    typically called \verb'B', is transposed.
+
+    Vectors and scalars are never transposed via the descriptor. If a method's
+    first parameter is a matrix and the second a vector or scalar, then
+    \verb'GrB_INP0' modifies the matrix parameter and
+    \verb'GrB_INP1' is ignored. If a method's first parameter is a
+    vector or scalar and the second a matrix, then \verb'GrB_INP1'
+    modifies the matrix parameter and \verb'GrB_INP0' is ignored.
+
+    To clarify this in each function, the inputs are labeled as
+    \verb'first input:' and \verb'second input:' in the function signatures.
+
+\item \verb'GxB_AxB_METHOD' suggests the method that should be
+    used to compute \verb'C=A*B'. All the methods compute the same result,
+    except they may have different floating-point roundoff errors. This
+    descriptor should be considered as a hint; SuiteSparse:GraphBLAS is
+    free to ignore it.
+
+    \begin{itemize}
+
+    \item \verb'GxB_DEFAULT' means that a method is selected automatically.
-The suffix \verb'TYPE' in the prototype above is one of \verb'BOOL',
-\verb'INT8', \verb'INT16', etc, for built-n types, or \verb'UDT' for
-user-defined types.  The type of the \verb'Ax' array must match this type.  No
-typecasting is performed.
+
+    \item \verb'GxB_AxB_SAXPY': select any saxpy-based method:
+    \verb'GxB_AxB_GUSTAVSON', and/or
+    \verb'GxB_AxB_HASH', or any mix of the two,
+    in contrast to the dot-product method.
-Unlike the \verb'GxB'
-pack/unpack methods, memory is not handed off between the user application
-and GraphBLAS.  The three arrays \verb'Ap', \verb'Ai'. and \verb'Ax' are not
-modified, and are still owned by the user application when the method finishes.
+
+    \item \verb'GxB_AxB_GUSTAVSON':  an extended version of Gustavson's method
+    \cite{Gustavson78}, which is a very good general-purpose method, but
+    sometimes the workspace can be too large.  Assuming all matrices are stored
+    by column, it computes \verb'C(:,j)=A*B(:,j)' with a sequence of {\em
+    saxpy} operations (\verb'C(:,j)+=A(:,k)*B(k,j)' for each nonzero
+    \verb'B(k,j)').
In the {\em coarse Gustavson} method, each internal thread
+    requires workspace of size $m$, equal to the number of rows of \verb'C',
+    which is not suitable if the matrices are extremely sparse or if there are
+    many threads.  For the {\em fine Gustavson} method, threads can share
+    workspace and update it via atomic operations.  If all matrices are stored
+    by row, then it computes \verb'C(i,:)=A(i,:)*B' in a sequence of sparse
+    {\em saxpy} operations, and using workspace of size $n$ per thread, or
+    group of threads, corresponding to the number of columns of \verb'C'.

-The matrix can be imported in one of three different formats:
+
+    \item \verb'GxB_AxB_HASH': a hash-based method, based on
+    \cite{10.1145/3229710.3229720}.  It is very efficient for hypersparse
+    matrices, matrix-vector-multiply, and when $|{\bf B}|$ is small.
+    SuiteSparse:GraphBLAS includes a {\em coarse hash} method, in which
+    each thread has its own hash workspace, and a {\em fine hash}
+    method, in which groups of threads share a single hash workspace,
+    as a concurrent data structure, using atomics.

-\begin{packed_itemize}
-    \item \verb'GrB_CSR_FORMAT':  % CSR format (equiv to GxB_SPARSE with GxB_BY_ROW)
-    Compressed-row format.  \verb'Ap' is an array of size \verb'nrows+1'.
-    The arrays \verb'Ai' and \verb'Ax' are of size \verb'nvals = Ap [nrows]',
-    and \verb'Ap[0]' must be zero.
-    The column indices of entries in the \verb'i'th row appear in
-    \verb'Ai[Ap[i]...Ap[i+1]-1]', and the values of those entries appear in
-    the same locations in \verb'Ax'.
-    The column indices need not be in any particular order.
+
+% [2] Yusuke Nagasaka, Satoshi Matsuoka, Ariful Azad, and Aydin Buluc. 2018.
+% High-Performance Sparse Matrix-Matrix Products on Intel KNL and Multicore
+% Architectures. In Proc. 47th Intl. Conf. on Parallel Processing (ICPP '18).
+% Association for Computing Machinery, New York, NY, USA, Article 34, 1–10.
+% DOI:https://doi.org/10.1145/3229710.3229720

-    \item \verb'GrB_CSC_FORMAT':  % CSC format (equiv to GxB_SPARSE with GxB_BY_COL)
-    Compressed-column format.  \verb'Ap' is an array of size \verb'ncols+1'.
-    The arrays \verb'Ai' and \verb'Ax' are of size \verb'nvals = Ap [ncols]',
-    and \verb'Ap[0]' must be zero.
-    The row indices of entries in the \verb'j'th column appear in
-    \verb'Ai[Ap[j]...Ap[j+1]-1]', and the values of those entries appear in
-    the same locations in \verb'Ax'.
-    The row indices need not be in any particular order.
-
-    \item \verb'GrB_COO_FORMAT':  % triplet format (like input to GrB*build)
-    Coordinate format.  This is the same format as \newline
-    \verb'GrB_Matrix_build'.
-    The three arrays \verb'Ap', \verb'Ai', and \verb'Ax' have the same
-    size.  The \verb'k'th tuple has row index \verb'Ai[k]',
-    column index \verb'Ap[k]', and value \verb'Ax[k]'.  The tuples can
-    appear any order, but no duplicates are permitted.
+
+\item \verb'GxB_AxB_DOT': computes \verb"C(i,j)=A(i,:)*B(j,:)'", for each
+    entry \verb'C(i,j)'.  If the mask is present and not complemented, only
+    entries for which \verb'M(i,j)=1' are computed.  This is a very specialized
+    method that works well only if the mask is present, very sparse, and not
+    complemented, when \verb'C' is small, or when \verb'C' is bitmap or full.
+    For example, it works very well
+    when \verb'A' and \verb'B' are tall and thin, and \verb"C=A*B'" is
+    computed.  This expression assumes all matrices are in
+    CSR format.  If in CSC format, then the dot-product method is used for
+    \verb"A'*B".
The method is impossibly slow if \verb'C' is large and the + mask is not present, since it takes $\Omega(mn)$ time if \verb'C' is + $m$-by-$n$ in that case. It does not use any workspace at all. Since it + uses no workspace, it can work very well for extremely sparse or + hypersparse matrices, when the mask is present and not complemented. -% \item \verb'GrB_DENSE_ROW_FORMAT': % FullR format (GxB_FULL with GxB_BY_ROW) -% Dense matrix format, held by row. Only the \verb'Ax' array is used, of -% size \verb'nrows*ncols'. -% It holds the matrix in dense format, in row major order. -% -% \item \verb'GrB_DENSE_COL_FORMAT': % FullC format (GxB_FULL with GxB_BY_ROW) -% Dense matrix format, held by column. Only the \verb'Ax' array is used, of -% size \verb'nrows*ncols'. -% It holds the matrix in dense format, in column major order. + \end{itemize} -\end{packed_itemize} +\item \verb'GxB_NTHREADS' controls how many threads a method uses. + By default (if set to zero, or \verb'GxB_DEFAULT'), all available threads + are used. The maximum available threads is controlled by the global + setting, which is \verb'omp_get_max_threads ( )' by default. If set to + some positive integer \verb'nthreads' less than this maximum, at most + \verb'nthreads' threads will be used. See Section~\ref{omp_parallelism} + for details. -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Matrix\_export:} export a matrix} -%------------------------------------------------------------------------------- -\label{GrB_matrix_export} +\item \verb'GxB_CHUNK' is a \verb'double' value that controls how many threads + a method uses for small problems. See Section~\ref{omp_parallelism} for + details. -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GrB_Matrix_export // export a matrix -( - GrB_Index *Ap, // pointers for CSR, CSC, column indices for COO - GrB_Index *Ai, // col indices for CSR/COO, row indices for CSC - *Ax, // values (must match the type of A_input) - GrB_Index *Ap_len, // number of entries in Ap (not # of bytes) - GrB_Index *Ai_len, // number of entries in Ai (not # of bytes) - GrB_Index *Ax_len, // number of entries in Ax (not # of bytes) - GrB_Format format, // export format - GrB_Matrix A // matrix to export -) ; -\end{verbatim} -} \end{mdframed} +\item \verb'GxB_SORT' provides a hint to \verb'GrB_mxm', \verb'GrB_mxv', + \verb'GrB_vxm', and \verb'GrB_reduce' (to vector). These methods can leave + the output matrix or vector in a jumbled state, where the final sort is + left as pending work. This is typically fastest, since some algorithms can + tolerate jumbled matrices on input, and sometimes the sort can be skipped + entirely. However, if the matrix or vector will be immediately exported in + unjumbled form, or provided as input to a method that requires it to not be + jumbled, then sorting it during the matrix multiplication is faster. + By default, these methods leave the result in jumbled form (a {\em lazy + sort}), if \verb'GxB_SORT' is set to zero (\verb'GxB_DEFAULT'). A nonzero + value will inform the matrix multiplication to sort its result, instead. -\verb'GrB_Matrix_export' copies the contents of a matrix into three -user-provided arrays, using any one of the three different formats -described in Section~\ref{GrB_matrix_import}. The size of the arrays must be -at least as large as the lengths returned by \verb'GrB_Matrix_exportSize'. The -matrix \verb'A' is not modified. 
+\item \verb'GxB_COMPRESSION' selects the compression method for serialization. + The default is ZSTD (level 1). See Section~\ref{serialize_deserialize} for + other options. -On input, the size of the three arrays \verb'Ap', \verb'Ai', and \verb'Ax' is -given by \verb'Ap_len', \verb'Ai_len', and \verb'Ax_len', respectively. These -values are in terms of the number of entries in these arrays, not the number of -bytes. On output, these three value are adjusted to report the number of -entries written to the three arrays. +\item \verb'GxB_IMPORT' informs the \verb'GxB' pack methods + that they can trust their input data, or not. The default is to trust + the input, for faster packing. If the data is being packed from an + untrusted source, then additional checks should be made, and the + following descriptor setting should be used: -The suffix \verb'TYPE' in the prototype above is one of \verb'BOOL', -\verb'INT8', \verb'INT16', etc, for built-n types, or \verb'UDT' for -user-defined types. The type of the \verb'Ax' array must match this type. No -typecasting is performed. + {\footnotesize + \begin{verbatim} + GxB_set (desc, GxB_IMPORT, GxB_SECURE_IMPORT) ; \end{verbatim}} -% The \verb'GrB_DENSE_ROW_FORMAT' and \verb'GrB_DENSE_COL_FORMAT' formats can -% only be used if all entries are present in the matrix. That is, -% \verb'GrB_Matrix_nvals (&nvals,A)' must return \verb'nvals' equal to -% \verb'nrows*ncols'. +\end{itemize} -\newpage -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Matrix\_exportSize:} determine size of export} -%------------------------------------------------------------------------------- -\label{export_size} +The next sections describe the methods for a \verb'GrB_Descriptor': -\begin{mdframed}[userdefinedwidth=6in] +\vspace{0.2in} {\footnotesize -\begin{verbatim} -GrB_Info GrB_Matrix_exportSize // determine sizes of user arrays for export -( - GrB_Index *Ap_len, // # of entries required for Ap (not # of bytes) - GrB_Index *Ai_len, // # of entries required for Ai (not # of bytes) - GrB_Index *Ax_len, // # of entries required for Ax (not # of bytes) - GrB_Format format, // export format - GrB_Matrix A // matrix to export -) ; -\end{verbatim} -} \end{mdframed} - -Returns the required sizes of the arrays \verb'Ap', \verb'Ai', and \verb'Ax' -for exporting a matrix using \verb'GrB_Matrix_export', using the same -\verb'format'. 
+\begin{tabular}{lll} +\hline +GraphBLAS function & purpose & Section \\ +\hline +\verb'GrB_Descriptor_new' & create a descriptor & \ref{descriptor_new} \\ +\verb'GrB_Descriptor_wait' & wait for a descriptor & \ref{descriptor_wait} \\ +\verb'GrB_Descriptor_set' & set a parameter in a descriptor & \ref{descriptor_set} \\ +\verb'GxB_Desc_set' & set a parameter in a descriptor & \ref{desc_set} \\ +\verb'GxB_Desc_get' & get a parameter from a descriptor & \ref{desc_get} \\ +\verb'GrB_Descriptor_free' & free a descriptor & \ref{descriptor_free} \\ +\hline +\end{tabular} +} +\newpage %------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Matrix\_exportHint:} determine best export format} +\subsubsection{{\sf GrB\_Descriptor\_new:} create a new descriptor} %------------------------------------------------------------------------------- -\label{export_hint} +\label{descriptor_new} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GrB_Matrix_exportHint // suggest the best export format +GrB_Info GrB_Descriptor_new // create a new descriptor ( - GrB_Format *format, // export format - GrB_Matrix A // matrix to export + GrB_Descriptor *descriptor // handle of descriptor to create ) ; -\end{verbatim} -} \end{mdframed} - -This method suggests the most efficient format for the export of a given -matrix. For SuiteSparse:GraphBLAS, the hint depends on the current -format of the \verb'GrB_Matrix': - -\begin{packed_itemize} -\item \verb'GxB_SPARSE', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' -\item \verb'GxB_SPARSE', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' -\item \verb'GxB_HYPERSPARSE': export as \verb'GrB_COO_FORMAT' -\item \verb'GxB_BITMAP', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' -\item \verb'GxB_BITMAP', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' -%\item \verb'GxB_FULL', \verb'GxB_BY_ROW': export as \verb'GrB_DENSE_ROW_FORMAT' -%\item \verb'GxB_FULL', \verb'GxB_BY_COL': export as \verb'GrB_DENSE_COL_FORMAT' -\item \verb'GxB_FULL', \verb'GxB_BY_ROW': export as \verb'GrB_CSR_FORMAT' -\item \verb'GxB_FULL', \verb'GxB_BY_COL': export as \verb'GrB_CSC_FORMAT' -\end{packed_itemize} - -\newpage -%=============================================================================== -\subsection{Sorting methods} -%=============================================================================== -\label{sorting_methods} +\end{verbatim} } \end{mdframed} -\verb'GxB_Matrix_sort' provides a mechanism to sort all the rows or -all the columns of a matrix, and \verb'GxB_Vector_sort' sorts all the -entries in a vector. +\verb'GrB_Descriptor_new' creates a new descriptor, with all fields set to +their defaults (output is not replaced, the mask is not complemented, the mask +is valued not structural, neither input matrix is transposed, the method +used in \verb'C=A*B' is selected automatically, and \verb'GrB_mxm' leaves +the final sort as pending work). 
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Vector\_sort:} sort a vector}
+\subsubsection{{\sf GrB\_Descriptor\_wait:} wait for a descriptor}
%-------------------------------------------------------------------------------
-\label{vector_sort}
+\label{descriptor_wait}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_sort
+GrB_Info GrB_wait  // wait for a descriptor
(
-    // output:
-    GrB_Vector w,        // vector of sorted values
-    GrB_Vector p,        // vector containing the permutation
-    // input
-    GrB_BinaryOp op,     // comparator op
-    GrB_Vector u,        // vector to sort
-    const GrB_Descriptor desc
+    GrB_Descriptor descriptor,  // descriptor to wait for
+    GrB_WaitMode mode           // GrB_COMPLETE or GrB_MATERIALIZE
) ;
\end{verbatim}
}\end{mdframed}

-\verb'GxB_Vector_sort' is identical to sorting the single column of an
-\verb'n'-by-1 matrix.  The descriptor is ignored, except to control the number
-of threads to use.  Refer to Section \ref{matrix_sort} for details.
+After creating a user-defined descriptor, a GraphBLAS library may choose to
+exploit non-blocking mode to delay its creation. Currently,
+SuiteSparse:GraphBLAS does nothing except to ensure that \verb'descriptor' is
+valid.

+\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_sort:} sort the rows/columns of a matrix}
+\subsubsection{{\sf GrB\_Descriptor\_set:} set a parameter in a descriptor}
%-------------------------------------------------------------------------------
-\label{matrix_sort}
+\label{descriptor_set}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_sort
+GrB_Info GrB_Descriptor_set  // set a parameter in a descriptor
(
-    // output:
-    GrB_Matrix C,        // matrix of sorted values
-    GrB_Matrix P,        // matrix containing the permutations
-    // input
-    GrB_BinaryOp op,     // comparator op
-    GrB_Matrix A,        // matrix to sort
-    const GrB_Descriptor desc
+    GrB_Descriptor desc,   // descriptor to modify
+    GrB_Desc_Field field,  // parameter to change
+    GrB_Desc_Value val     // value to change it to
) ;
-\end{verbatim}
-} \end{mdframed}
+\end{verbatim} } \end{mdframed}

-\verb'GxB_Matrix_sort' sorts all the rows or all the columns of a matrix.
-Each row (or column) is sorted separately.  The rows are sorted by default.
-To sort the columns, use \verb'GrB_DESC_T0'.  A comparator operator is
-provided to define the sorting order (ascending or descending).
-For example, to sort a \verb'GrB_FP64' matrix in ascending order,
-use \verb'GrB_LT_FP64' as the \verb'op', and to sort in descending order,
-use \verb'GrB_GT_FP64'.
+\verb'GrB_Descriptor_set' sets a descriptor field (\verb'GrB_OUTP',
+\verb'GrB_MASK', \verb'GrB_INP0', \verb'GrB_INP1', or \verb'GxB_AxB_METHOD') to
+a particular value. Use \verb'GxB_Desc_set' to set the value of
+\verb'GxB_NTHREADS', \verb'GxB_CHUNK', and \verb'GxB_SORT'.
+If an error occurs, \verb'GrB_error(&err,desc)' returns details about the error.

-The \verb'op' must have a return value of \verb'GrB_BOOL', and the types of
-its two inputs must be the same.  The entries in \verb'A' are typecasted to
-the inputs of the \verb'op', if necessary.  Matrices with user-defined types
-can be sorted with a user-defined comparator operator, whose two input types
-must match the type of \verb'A', and whose output is \verb'GrB_BOOL'.
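+
+As a short sketch (assuming \verb'C', \verb'M', \verb'A', and \verb'B' have
+already been created with compatible types and dimensions), the following
+builds a descriptor that transposes the first input and complements the mask,
+then passes it to \verb'GrB_mxm':
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Descriptor desc ;
+    GrB_Descriptor_new (&desc) ;
+    GrB_Descriptor_set (desc, GrB_INP0, GrB_TRAN) ;  // use A' instead of A
+    GrB_Descriptor_set (desc, GrB_MASK, GrB_COMP) ;  // use the complement of M
+    GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, desc) ;
+    GrB_free (&desc) ;
+\end{verbatim}}
+
+The same settings are available as the built-in descriptor \verb'GrB_DESC_CT0'
+(Section~\ref{descriptor_predefined}), which avoids the need to create and
+free a descriptor.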
+\vspace{0.2in} +\noindent +{\footnotesize +\begin{tabular}{|l|p{2.4in}|p{2.2in}|} +\hline +Descriptor & Default & Non-default \\ +field & & \\ +\hline -The two matrix outputs are \verb'C' and \verb'P'. Any entries present on input -in \verb'C' or \verb'P' are discarded on output. The type of \verb'C' must -match the type of \verb'A' exactly. The dimensions of \verb'C', \verb'P', and -\verb'A' must also match exactly (even with the \verb'GrB_DESC_T0' -descriptor). +\verb'GrB_OUTP' + & \verb'GxB_DEFAULT': + The output matrix is not cleared. The operation computes + ${\bf C \langle M \rangle = C \odot T}$. + & \verb'GrB_REPLACE': + After computing ${\bf Z=C\odot T}$, + the output {\bf C} is cleared of all entries. + Then ${\bf C \langle M \rangle = Z}$ is performed. \\ -With the default sort (by row), suppose \verb'A(i,:)' contains \verb'k' -entries. In this case, \verb'C(i,0:k-1)' contains the values of those entries -in sorted order, and \verb'P(i,0:k-1)' contains their corresponding column -indices in the matrix \verb'A'. If two values are the same, ties are broken -according column index. +\hline -If the matrix is sorted by column, and \verb'A(:,j)' contains \verb'k' entries, -then \verb'C(0:k-1,j)' contains the values of those entries in sorted order, -and \verb'P(0:k-1,j)' contains their corresponding row indices in the matrix -\verb'A'. If two values are the same, ties are broken according row index. +\verb'GrB_MASK' + & \verb'GxB_DEFAULT': + The Mask is not complemented. \verb'Mask(i,j)=1' means the value $C_{ij}$ + can be modified by the operation, while \verb'Mask(i,j)=0' means the value + $C_{ij}$ shall not be modified by the operation. + & \verb'GrB_COMP': + The Mask is complemented. \verb'Mask(i,j)=0' means the value $C_{ij}$ + can be modified by the operation, while \verb'Mask(i,j)=1' means the value + $C_{ij}$ shall not be modified by the operation. \\ + & + & \verb'GrB_STRUCTURE': + The values of the Mask are ignored. If \verb'Mask(i,j)' is an entry + in the \verb'Mask' matrix, it is treated as if \verb'Mask(i,j)=1'. + The two options \verb'GrB_COMP' and \verb'GrB_STRUCTURE' can be + combined, with two subsequent calls, or with a single call with the setting + \verb'GrB_COMP+GrB_STRUCTURE'. \\ -The outputs \verb'C' and \verb'P' are both optional; either one (but not both) -may be \verb'NULL', in which case that particular output matrix is not -computed. +\hline -\newpage -%=============================================================================== -\subsection{GraphBLAS descriptors: {\sf GrB\_Descriptor}} %===================== -%=============================================================================== -\label{descriptor} +\verb'GrB_INP0' + & \verb'GxB_DEFAULT': + The first input is not transposed prior to using it in the operation. + & \verb'GrB_TRAN': + The first input is transposed prior to using it in the operation. Only + matrices are transposed, never vectors. \\ -A GraphBLAS {\em descriptor} modifies the behavior of a GraphBLAS operation. -If the descriptor is \verb'GrB_NULL', defaults are used. +\hline -The access to these parameters and their values is governed -by two \verb'enum' types, \verb'GrB_Desc_Field' and \verb'GrB_Desc_Value': +\verb'GrB_INP1' + & \verb'GxB_DEFAULT': + The second input is not transposed prior to using it in the operation. + & \verb'GrB_TRAN': + The second input is transposed prior to using it in the operation. Only + matrices are transposed, never vectors. 
\\

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-#define GxB_NTHREADS 5   // for both GrB_Desc_field and GxB_Option_field
-#define GxB_CHUNK 7
-typedef enum
-{
-    GrB_OUTP = 0,   // descriptor for output of a method
-    GrB_MASK = 1,   // descriptor for the mask input of a method
-    GrB_INP0 = 2,   // descriptor for the first input of a method
-    GrB_INP1 = 3,   // descriptor for the second input of a method
-    GxB_DESCRIPTOR_NTHREADS = GxB_NTHREADS,  // number of threads to use
-    GxB_DESCRIPTOR_CHUNK = GxB_CHUNK,        // chunk size for small problems
-    GxB_AxB_METHOD = 1000,  // descriptor for selecting C=A*B algorithm
-    GxB_SORT = 35           // control sort in GrB_mxm
-    GxB_COMPRESSION = 36,   // select compression for serialize
-    GxB_IMPORT = 37,        // secure vs fast pack
-}
-GrB_Desc_Field ;
+\hline

-typedef enum
-{
-    // for all GrB_Descriptor fields:
-    GxB_DEFAULT = 0,    // default behavior of the method
-    // for GrB_OUTP only:
-    GrB_REPLACE = 1,    // clear the output before assigning new values to it
-    // for GrB_MASK only:
-    GrB_COMP = 2,       // use the complement of the mask
-    GrB_STRUCTURE = 4,  // use the structure of the mask
-    // for GrB_INP0 and GrB_INP1 only:
-    GrB_TRAN = 3,       // use the transpose of the input
-    // for GxB_AxB_METHOD only:
-    GxB_AxB_GUSTAVSON = 1001,  // gather-scatter saxpy method
-    GxB_AxB_DOT = 1003,        // dot product
-    GxB_AxB_HASH = 1004,       // hash-based saxpy method
-    GxB_AxB_SAXPY = 1005       // saxpy method (any kind)
-    // for GxB_IMPORT only:
-    GxB_SECURE_IMPORT = 502    // GxB*_pack* methods trust their input data
+\verb'GxB_AxB_METHOD'
+    & \verb'GxB_DEFAULT':
+    The method for \verb'C=A*B' is selected automatically.
+    & \verb'GxB_AxB_'{\em method}: The selected method is used to compute
+    \verb'C=A*B'. \\
+
+\hline
+\end{tabular}
}
-GrB_Desc_Value ;
-\end{verbatim} } \end{mdframed}
\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Desc\_set:} set a parameter in a descriptor}
+%-------------------------------------------------------------------------------
+\label{desc_set}

-\begin{itemize}
-\item \verb'GrB_OUTP' is a parameter that modifies the output of a
-    GraphBLAS operation.  In the default case, the output is not cleared, and
-    ${\bf Z = C \odot T}$ then ${\bf C \langle M \rangle = Z}$ are computed
-    as-is, where ${\bf T}$ is the results of the particular GraphBLAS
-    operation.
-
-    In the non-default case, ${\bf Z = C \odot T}$ is first computed, using the
-    results of ${\bf T}$ and the accumulator $\odot$.  After this is done, if
-    the \verb'GrB_OUTP' descriptor field is set to \verb'GrB_REPLACE', then the
-    output is cleared of its entries.  Next, the assignment ${\bf C \langle M
-    \rangle = Z}$ is performed.
-
-\item \verb'GrB_MASK' is a parameter that modifies the \verb'Mask',
-    even if the mask is not present.
-
-    If this parameter is set to its default value, and if the mask is not
-    present (\verb'Mask==NULL') then implicitly \verb'Mask(i,j)=1' for all
-    \verb'i' and \verb'j'.  If the mask is present then \verb'Mask(i,j)=1'
-    means that \verb'C(i,j)' is to be modified by the ${\bf C \langle M \rangle
-    = Z}$ update.  Otherwise, if \verb'Mask(i,j)=0', then \verb'C(i,j)' is not
-    modified, even if \verb'Z(i,j)' is an entry with a different value; that
-    value is simply discarded.
-
-    If the \verb'GrB_MASK' parameter is set to \verb'GrB_COMP', then the
-    use of the mask is complemented.
In this case, if the mask is not present - (\verb'Mask==NULL') then implicitly \verb'Mask(i,j)=0' for all \verb'i' and - \verb'j'. This means that none of ${\bf C}$ is modified and the entire - computation of ${\bf Z}$ might as well have been skipped. That is, a - complemented empty mask means no modifications are made to the output - object at all, except perhaps to clear it in accordance with the - \verb'GrB_OUTP' descriptor. With a complemented mask, if the mask is - present then \verb'Mask(i,j)=0' means that \verb'C(i,j)' is to be modified - by the ${\bf C \langle M \rangle = Z}$ update. Otherwise, if - \verb'Mask(i,j)=1', then \verb'C(i,j)' is not modified, even if - \verb'Z(i,j)' is an entry with a different value; that value is simply - discarded. - - If the \verb'GrB_MASK' parameter is set to \verb'GrB_STRUCTURE', - then the values of the mask are ignored, and just the pattern of the - entries is used. Any entry \verb'M(i,j)' in the pattern is treated as if - it were true. - - The \verb'GrB_COMP' and \verb'GrB_STRUCTURE' settings can be combined, - either by setting the mask option twice (once with each value), or by - setting the mask option to \verb'GrB_COMP+GrB_STRUCTURE' (the latter is an - extension to the specification). +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_Desc_set // set a parameter in a descriptor +( + GrB_Descriptor desc, // descriptor to modify + GrB_Desc_Field field, // parameter to change + ... // value to change it to +) ; +\end{verbatim} } \end{mdframed} - Using a parameter to complement the \verb'Mask' is very useful because - constructing the actual complement of a very sparse mask is impossible - since it has too many entries. If the number of places in \verb'C' - that should be modified is very small, then use a sparse mask without - complementing it. If the number of places in \verb'C' that should - be protected from modification is very small, then use a sparse mask - to indicate those places, and use a descriptor \verb'GrB_MASK' that - complements the use of the mask. +\verb'GxB_Desc_set' is like \verb'GrB_Descriptor_set', except that the type of +the third parameter can vary with the field. This function can modify all +descriptor settings, including those that do not have the type +\verb'GrB_Desc_Value'. See also \verb'GxB_set' described in +Section~\ref{options}. If an error occurs, \verb'GrB_error(&err,desc)' returns +details about the error. -\item \verb'GrB_INP0' and \verb'GrB_INP1' modify the use of the - first and second input matrices \verb'A' and \verb'B' of the GraphBLAS - operation. +%------------------------------------------------------------------------------- +\subsubsection{{\sf GxB\_Desc\_get:} get a parameter from a descriptor} +%------------------------------------------------------------------------------- +\label{desc_get} - If the \verb'GrB_INP0' is set to \verb'GrB_TRAN', then \verb'A' is - transposed before using it in the operation. Likewise, if - \verb'GrB_INP1' is set to \verb'GrB_TRAN', then the second input, - typically called \verb'B', is transposed. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_Desc_get // get a parameter from a descriptor +( + GrB_Descriptor desc, // descriptor to query; NULL means defaults + GrB_Desc_Field field, // parameter to query + ... // value of the parameter +) ; +\end{verbatim} } \end{mdframed} - Vectors and scalars are never transposed via the descriptor. 
If a method's - first parameter is a matrix and the second a vector or scalar, then - \verb'GrB_INP0' modifies the matrix parameter and - \verb'GrB_INP1' is ignored. If a method's first parameter is a - vector or scalar and the second a matrix, then \verb'GrB_INP1' - modifies the matrix parameter and \verb'GrB_INP0' is ignored. +\verb'GxB_Desc_get' returns the value of a single field in a descriptor. The +type of the third parameter is a pointer to a variable type, whose type depends +on the field. See also \verb'GxB_get' described in Section~\ref{options}. - To clarify this in each function, the inputs are labeled as - \verb'first input:' and \verb'second input:' in the function signatures. +%------------------------------------------------------------------------------- +\subsubsection{{\sf GrB\_Descriptor\_free:} free a descriptor} +%------------------------------------------------------------------------------- +\label{descriptor_free} -\item \verb'GxB_AxB_METHOD' suggests the method that should be - used to compute \verb'C=A*B'. All the methods compute the same result, - except they may have different floating-point roundoff errors. This - descriptor should be considered as a hint; SuiteSparse:GraphBLAS is - free to ignore it. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GrB_free // free a descriptor +( + GrB_Descriptor *descriptor // handle of descriptor to free +) ; +\end{verbatim} } \end{mdframed} - \begin{itemize} +\verb'GrB_Descriptor_free' frees a descriptor. +Either usage: - \item \verb'GxB_DEFAULT' means that a method is selected automatically. + {\small + \begin{verbatim} + GrB_Descriptor_free (&descriptor) ; + GrB_free (&descriptor) ; \end{verbatim}} - \item \verb'GxB_AxB_SAXPY': select any saxpy-based method: - \verb'GxB_AxB_GUSTAVSON', and/or - \verb'GxB_AxB_HASH', or any mix of the two, - in contrast to the dot-product method. +\noindent +frees the \verb'descriptor' and sets \verb'descriptor' to \verb'NULL'. It +safely does nothing if passed a \verb'NULL' handle, or if +\verb'descriptor == NULL' on input. - \item \verb'GxB_AxB_GUSTAVSON': an extended version of Gustavson's method - \cite{Gustavson78}, which is a very good general-purpose method, but - sometimes the workspace can be too large. Assuming all matrices are stored - by column, it computes \verb'C(:,j)=A*B(:,j)' with a sequence of {\em - saxpy} operations (\verb'C(:,j)+=A(:,k)*B(k:,j)' for each nonzero - \verb'B(k,j)'). In the {\em coarse Gustavson} method, each internal thread - requires workspace of size $m$, to the number of rows of \verb'C', which is - not suitable if the matrices are extremely sparse or if there are many - threads. For the {\em fine Gustavson} method, threads can share workspace - and update it via atomic operations. If all matrices are stored by row, - then it computes \verb'C(i,:)=A(i,:)*B' in a sequence of sparse {\em saxpy} - operations, and using workspace of size $n$ per thread, or group of - threads, corresponding to the number of columns of \verb'C'. +\newpage +%------------------------------------------------------------------------------- +\subsubsection{{\sf GrB\_DESC\_*:} built-in descriptors} +%------------------------------------------------------------------------------- +\label{descriptor_predefined} - \item \verb'GxB_AxB_HASH': a hash-based method, based on - \cite{10.1145/3229710.3229720}. It is very efficient for hypersparse - matrices, matrix-vector-multiply, and when $|{\bf B}|$ is small. 
- SuiteSparse:GraphBLAS includes a {\em coarse hash} method, in which - each thread has its own hash workspace, and a {\em fine hash} - method, in which groups of threads share a single hash workspace, - as concurrent data structure, using atomics. +Built-in descriptors are listed in the table below. A dash in the table +indicates the default. These descriptors may not be modified or freed. +Attempts to modify them result in an error (\verb'GrB_INVALID_VALUE'); attempts +to free them are silently ignored. -% [2] Yusuke Nagasaka, Satoshi Matsuoka, Ariful Azad, and Aydin Buluc. 2018. -% High-Performance Sparse Matrix-Matrix Products on Intel KNL and Multicore -% Architectures. In Proc. 47th Intl. Conf. on Parallel Processing (ICPP '18). -% Association for Computing Machinery, New York, NY, USA, Article 34, 1–10. -% DOI:https://doi.org/10.1145/3229710.3229720 +% \verb'GrB_NULL' is the default descriptor, with all settings at their defaults: +% \verb'OUTP': do not replace the output, +% \verb'MASK': mask is valued and not complemented, +% \verb'INP0': first input not transposed, and +% \verb'INP1': second input not transposed. +% For these pre-defined descriptors, the +% \verb'GxB_NTHREADS', +% \verb'GxB_CHUNK', and +% \verb'GxB_SORT' settings are at their default values. -\item \verb'GxB_AxB_DOT': computes \verb"C(i,j)=A(i,:)*B(j,:)'", for each - entry \verb'C(i,j)'. If the mask is present and not complemented, only - entries for which \verb'M(i,j)=1' are computed. This is a very specialized - method that works well only if the mask is present, very sparse, and not - complemented, when \verb'C' is small, or when \verb'C' is bitmap or full. - For example, it works very well - when \verb'A' and \verb'B' are tall and thin, and \verb"C=A*B'" or - \verb"C=A*B'" are computed. These expressions assume all matrices are in - CSR format. If in CSC format, then the dot-product method used for - \verb"A'*B". The method is impossibly slow if \verb'C' is large and the - mask is not present, since it takes $\Omega(mn)$ time if \verb'C' is - $m$-by-$n$ in that case. It does not use any workspace at all. Since it - uses no workspace, it can work very well for extremely sparse or - hypersparse matrices, when the mask is present and not complemented. 
+\vspace{0.2in} +\noindent +{\footnotesize +\begin{tabular}{|l|lllll|} +\hline +Descriptor & \verb'OUTP' & \verb'MASK' & \verb'MASK' & \verb'INP0' & \verb'INP1' \\ + & & structural & complement & & \\ +\hline +\verb'GrB_NULL' & - & - & - & - & - \\ +\verb'GrB_DESC_T1' & - & - & - & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_T0' & - & - & - & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_T0T1' & - & - & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_C' & - & - & \verb'GrB_COMP' & - & - \\ +\verb'GrB_DESC_CT1' & - & - & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_CT0' & - & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_CT0T1' & - & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_S' & - & \verb'GrB_STRUCTURE' & - & - & - \\ +\verb'GrB_DESC_ST1' & - & \verb'GrB_STRUCTURE' & - & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_ST0' & - & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_ST0T1' & - & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_SC' & - & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & - \\ +\verb'GrB_DESC_SCT1' & - & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_SCT0' & - & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_SCT0T1' & - & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_R' & \verb'GrB_REPLACE' & - & - & - & - \\ +\verb'GrB_DESC_RT1' & \verb'GrB_REPLACE' & - & - & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_RT0' & \verb'GrB_REPLACE' & - & - & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_RT0T1' & \verb'GrB_REPLACE' & - & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_RC' & \verb'GrB_REPLACE' & - & \verb'GrB_COMP' & - & - \\ +\verb'GrB_DESC_RCT1' & \verb'GrB_REPLACE' & - & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_RCT0' & \verb'GrB_REPLACE' & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_RCT0T1' & \verb'GrB_REPLACE' & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_RS' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & - & - & - \\ +\verb'GrB_DESC_RST1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & - & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_RST0' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_RST0T1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\verb'GrB_DESC_RSC' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & - \\ +\verb'GrB_DESC_RSCT1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ +\verb'GrB_DESC_RSCT0' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ +\verb'GrB_DESC_RSCT0T1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\hline +\end{tabular}} - \end{itemize} +\newpage +%=============================================================================== +\subsection{{\sf GrB\_free:} free any GraphBLAS object} %======================= +%=============================================================================== +\label{free} -\item \verb'GxB_NTHREADS' controls how many threads a method uses. - By default (if set to zero, or \verb'GxB_DEFAULT'), all available threads - are used. The maximum available threads is controlled by the global - setting, which is \verb'omp_get_max_threads ( )' by default. 
If set to - some positive integer \verb'nthreads' less than this maximum, at most - \verb'nthreads' threads will be used. See Section~\ref{omp_parallelism} - for details. +Each of the ten objects has \verb'GrB_*_new' and \verb'GrB_*_free' methods +that are specific to each object. They can also be accessed by a generic +function, \verb'GrB_free', that works for all ten objects. If \verb'G' is any +of the ten objects, the statement -\item \verb'GxB_CHUNK' is a \verb'double' value that controls how many threads - a method uses for small problems. See Section~\ref{omp_parallelism} for - details. + {\footnotesize + \begin{verbatim} + GrB_free (&G) ; \end{verbatim} } -\item \verb'GxB_SORT' provides a hint to \verb'GrB_mxm', \verb'GrB_mxv', - \verb'GrB_vxm', and \verb'GrB_reduce' (to vector). These methods can leave - the output matrix or vector in a jumbled state, where the final sort is - left as pending work. This is typically fastest, since some algorithms can - tolerate jumbled matrices on input, and sometimes the sort can be skipped - entirely. However, if the matrix or vector will be immediately exported in - unjumbled form, or provided as input to a method that requires it to not be - jumbled, then sorting it during the matrix multiplication is faster. - By default, these methods leave the result in jumbled form (a {\em lazy - sort}), if \verb'GxB_SORT' is set to zero (\verb'GxB_DEFAULT'). A nonzero - value will inform the matrix multiplication to sort its result, instead. +\noindent +frees the object and sets the variable \verb'G' to \verb'NULL'. It is safe to +pass in a \verb'NULL' handle, or to free an object twice: -\item \verb'GxB_COMPRESSION' selects the compression method for serialization. - The default is LZ4. See Section~\ref{serialize_deserialize} for other - options. + {\footnotesize + \begin{verbatim} + GrB_free (NULL) ; // SuiteSparse:GraphBLAS safely does nothing + GrB_free (&G) ; // the object G is freed and G set to NULL + GrB_free (&G) ; // SuiteSparse:GraphBLAS safely does nothing \end{verbatim} } -\item \verb'GxB_IMPORT' informs the \verb'GxB' pack methods - that they can trust their input data, or not. The default is to trust - the input, for faster packing. If the data is being packed from an - untrusted source, then additional checks should be made, and the - following descriptor setting should be used: +\noindent +However, the following sequence of operations is not safe. The first two are +valid but the last statement will lead to undefined behavior. {\footnotesize \begin{verbatim} - GxB_set (desc, GxB_IMPORT, GxB_SECURE_IMPORT) ; \end{verbatim}} + H = G ; // valid; creates a 2nd handle of the same object + GrB_free (&G) ; // valid; G is freed and set to NULL; H now undefined + GrB_some_method (H) ; // not valid; H is undefined \end{verbatim}} -\end{itemize} +Some objects are predefined, such as the built-in types. If a user application +attempts to free a built-in object, SuiteSparse:GraphBLAS will safely do +nothing. The \verb'GrB_free' function in SuiteSparse:GraphBLAS always +returns \verb'GrB_SUCCESS'. 
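+
+As a minimal sketch of the object-specific and generic forms (the matrix type
+and dimensions here are arbitrary):
+
+ {\footnotesize
+ \begin{verbatim}
+    GrB_Matrix A = NULL ;
+    GrB_Matrix_new (&A, GrB_FP64, 4, 4) ;
+    // ... use A ...
+    GrB_Matrix_free (&A) ;    // object-specific form; A is set to NULL
+    // or, equivalently:  GrB_free (&A) ; \end{verbatim} }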
+
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{The mask, accumulator, and replace option} %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{sec:maskaccum}
+
+After a GraphBLAS operation computes a result ${\bf T}$ (for example, ${\bf
+T=AB}$ for \verb'GrB_mxm'), the results are assigned to an output matrix ${\bf
+C}$ via the mask/accumulator phase, written as ${\bf C \langle M \rangle = C
+\odot T}$.  This phase is affected by the \verb'GrB_REPLACE' option in the
+descriptor, the presence of an optional binary accumulator operator ($\odot$),
+the presence of the optional mask matrix ${\bf M}$, and the status of the mask
+descriptor.  The interplay of these options is summarized in
+Table~\ref{tab:maskaccum}.
+
+The mask ${\bf M}$ may be present, or not.  It may be structural or valued, and
+it may be complemented, or not.  These options may be combined, for a total of
+8 cases, although the structural/valued option has no effect if ${\bf M}$ is not
+present.  If ${\bf M}$ is not present and not complemented, then $m_{ij}$ is
+implicitly true.  If not present but complemented, then all $m_{ij}$ entries are
+implicitly zero; in this case, ${\bf T}$ need not be computed at all.  Either
+${\bf C}$ is not modified, or all its entries are cleared if the replace option
+is enabled.  If ${\bf M}$ is present, and the structural option is used, then
+$m_{ij}$ is treated as true if it is an entry in the matrix (its value is
+ignored).  Otherwise, the value of $m_{ij}$ is used.  In both cases, entries
+not present are implicitly zero.  These values are negated if the mask is
+complemented.  All of these various cases are combined to give a single
+effective value of the mask at position ${ij}$.
+
+The combination of all these options is presented in
+Table~\ref{tab:maskaccum}.  The first column is the \verb'GrB_REPLACE' option.
+The second column lists whether or not the accumulator operator is present.
+The third column lists whether or not $c_{ij}$ exists on input to the
+mask/accumulator phase (a dash means that it does not exist).  The fourth
+column lists whether or not the entry $t_{ij}$ is present in the result matrix
+${\bf T}$.  The mask column is the final effective value of $m_{ij}$, after
+accounting for the presence of ${\bf M}$ and the mask options.  Finally, the
+last column states the result of the mask/accum step; if no action is listed in
+this column, then $c_{ij}$ is not modified.
+
+Several important observations can be made from this table.  First,
+if no mask is present (and the mask-complement descriptor option is not used),
+then only the first half of the table is used.  In this case, the \verb'GrB_REPLACE'
+option has no effect.  The entire matrix ${\bf C}$ is modified.

-The next sections describe the methods for a \verb'GrB_Descriptor':

+Consider the cases when $c_{ij}$ is present but $t_{ij}$ is not, and there is no
+mask or the effective value of the mask is true for this ${ij}$ position.  With
+no accumulator operator, $c_{ij}$ is deleted.  If the accumulator operator is
+present and the replace option is not used, $c_{ij}$ remains unchanged.
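+
+As a concrete sketch of these options (assuming \verb'C', \verb'M', \verb'A',
+and \verb'B' are \verb'GrB_FP64' matrices that have already been created):
+
+ {\footnotesize
+ \begin{verbatim}
+    // C<M> = C + A*B: accumulate with PLUS; since GrB_REPLACE is not used,
+    // entries of C outside the mask are left as-is
+    GrB_mxm (C, M, GrB_PLUS_FP64, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, NULL) ;
+
+    // C<M> = A*B, with GrB_REPLACE: no accumulator, and any entry of C not
+    // selected by the mask is deleted
+    GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, GrB_DESC_R) ; \end{verbatim} }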
-\vspace{0.2in} -{\footnotesize -\begin{tabular}{lll} +\begin{table} +{\small +\begin{tabular}{lllll|l} \hline -GraphBLAS function & purpose & Section \\ +repl & accum & ${\bf C}$ & ${\bf T}$ & mask & action taken by ${\bf C \langle M \rangle = C \odot T}$ \\ \hline -\verb'GrB_Descriptor_new' & create a descriptor & \ref{descriptor_new} \\ -\verb'GrB_Descriptor_wait' & wait for a descriptor & \ref{descriptor_wait} \\ -\verb'GrB_Descriptor_set' & set a parameter in a descriptor & \ref{descriptor_set} \\ -\verb'GxB_Desc_set' & set a parameter in a descriptor & \ref{desc_set} \\ -\verb'GxB_Desc_get' & get a parameter from a descriptor & \ref{desc_get} \\ -\verb'GrB_Descriptor_free' & free a descriptor & \ref{descriptor_free} \\ + - &- & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, update \\ + - &- & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ + - &- & $c_{ij}$ & - & 1 & delete $c_{ij}$ because $t_{ij}$ not present \\ + - &- & - & - & 1 & \\ + - &- & $c_{ij}$ & $t_{ij}$ & 0 & \\ + - &- & - & $t_{ij}$ & 0 & \\ + - &- & $c_{ij}$ & - & 0 & \\ + - &- & - & - & 0 & \\ +\hline + yes&- & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, update \\ + yes&- & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ + yes&- & $c_{ij}$ & - & 1 & delete $c_{ij}$ because $t_{ij}$ not present \\ + yes&- & - & - & 1 & \\ + yes&- & $c_{ij}$ & $t_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ + yes&- & - & $t_{ij}$ & 0 & \\ + yes&- & $c_{ij}$ & - & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ + yes&- & - & - & 0 & \\ +\hline + - &yes & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\ + - &yes & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ + - &yes & $c_{ij}$ & - & 1 & \\ + - &yes & - & - & 1 & \\ + - &yes & $c_{ij}$ & $t_{ij}$ & 0 & \\ + - &yes & - & $t_{ij}$ & 0 & \\ + - &yes & $c_{ij}$ & - & 0 & \\ + - &yes & - & - & 0 & \\ +\hline + yes&yes & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\ + yes&yes & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ + yes&yes & $c_{ij}$ & - & 1 & \\ + yes&yes & - & - & 1 & \\ + yes&yes & $c_{ij}$ & $t_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ + yes&yes & - & $t_{ij}$ & 0 & \\ + yes&yes & $c_{ij}$ & - & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ + yes&yes & - & - & 0 & \\ \hline \end{tabular} } +\caption{Results of the mask/accumulator phase. \label{tab:maskaccum}} +\end{table} -\newpage -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Descriptor\_new:} create a new descriptor} -%------------------------------------------------------------------------------- -\label{descriptor_new} - -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GrB_Descriptor_new // create a new descriptor -( - GrB_Descriptor *descriptor // handle of descriptor to create -) ; -\end{verbatim} } \end{mdframed} - -\verb'GrB_Descriptor_new' creates a new descriptor, with all fields set to -their defaults (output is not replaced, the mask is not complemented, the mask -is valued not structural, neither input matrix is transposed, the method -used in \verb'C=A*B' is selected automatically, and \verb'GrB_mxm' leaves -the final sort as pending work). 
-
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Descriptor\_wait:} wait for a descriptor}
-%-------------------------------------------------------------------------------
-\label{descriptor_wait}
-
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_wait                   // wait for a descriptor
-(
-    GrB_Descriptor descriptor,      // descriptor to wait for
-    GrB_WaitMode mode               // GrB_COMPLETE or GrB_MATERIALIZE
-) ;
-\end{verbatim}
-}\end{mdframed}
-
-After creating a user-defined descriptor, a GraphBLAS library may choose to
-exploit non-blocking mode to delay its creation.  Currently,
-SuiteSparse:GraphBLAS does nothing except to ensure that \verb'd' is valid.
-
-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Descriptor\_set:} set a parameter in a descriptor}
-%-------------------------------------------------------------------------------
-\label{descriptor_set}
-
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_Descriptor_set         // set a parameter in a descriptor
-(
-    GrB_Descriptor desc,            // descriptor to modify
-    GrB_Desc_Field field,           // parameter to change
-    GrB_Desc_Value val              // value to change it to
-) ;
-\end{verbatim} } \end{mdframed}
-
-\verb'GrB_Descriptor_set' sets a descriptor field (\verb'GrB_OUTP',
-\verb'GrB_MASK', \verb'GrB_INP0', \verb'GrB_INP1', or \verb'GxB_AxB_METHOD') to
-a particular value.  Use \verb'GxB_Dec_set' to set the value of
-\verb'GxB_NTHREADS', \verb'GxB_CHUNK', and \verb'GxB_SORT'.
-If an error occurs, \verb'GrB_error(&err,desc)' returns details about the error.
+When there is no mask and the mask \verb'GrB_COMP' option is not selected, the
+table simplifies (Table~\ref{tab:maskaccum_nomask}).  The \verb'GrB_REPLACE'
+option no longer has any effect.  The \verb'GrB_SECOND_T' binary operator, when
+used as the accumulator, unifies the first two cases, as shown in
+Table~\ref{tab:maskaccum_nomask_2nd}.  The only difference now is the behavior
+when $c_{ij}$ is present but $t_{ij}$ is not.  Finally, the effect of
+\verb'GrB_FIRST_T' as the accumulator is shown in
+Table~\ref{tab:maskaccum_nomask_1st}.

-\vspace{0.2in}
-\noindent
-{\footnotesize
-\begin{tabular}{|l|p{2.4in}|p{2.2in}|}
+\begin{table}[h]
+\begin{center}
+{\small
+\begin{tabular}{lll|l}
 \hline
-Descriptor & Default & Non-default \\
-field & & \\
+ accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
 \hline
-
-\verb'GrB_OUTP'
-    & \verb'GxB_DEFAULT':
-    The output matrix is not cleared.  The operation computes
-    ${\bf C \langle M \rangle = C \odot T}$.
-    & \verb'GrB_REPLACE':
-    After computing ${\bf Z=C\odot T}$,
-    the output {\bf C} is cleared of all entries.
-    Then ${\bf C \langle M \rangle = Z}$ is performed. \\
-
+ -   & $c_{ij}$ & $t_{ij}$ & $c_{ij} = t_{ij}$, update \\
+ -   & -        & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
+ -   & $c_{ij}$ & -        & delete $c_{ij}$ because $t_{ij}$ not present \\
+ -   & -        & -        & \\
 \hline
-
-\verb'GrB_MASK'
-    & \verb'GxB_DEFAULT':
-    The Mask is not complemented.  \verb'Mask(i,j)=1' means the value $C_{ij}$
-    can be modified by the operation, while \verb'Mask(i,j)=0' means the value
-    $C_{ij}$ shall not be modified by the operation.
-    & \verb'GrB_COMP':
-    The Mask is complemented.  \verb'Mask(i,j)=0' means the value $C_{ij}$
-    can be modified by the operation, while \verb'Mask(i,j)=1' means the value
-    $C_{ij}$ shall not be modified by the operation. \\
-    &
-    & \verb'GrB_STRUCTURE':
-    The values of the Mask are ignored.  If \verb'Mask(i,j)' is an entry
-    in the \verb'Mask' matrix, it is treated as if \verb'Mask(i,j)=1'.
-    The two options \verb'GrB_COMP' and \verb'GrB_STRUCTURE' can be
-    combined, with two subsequent calls, or with a single call with the setting
-    \verb'GrB_COMP+GrB_STRUCTURE'. \\
-
+ yes & $c_{ij}$ & $t_{ij}$ & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\
+ yes & - & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
+ yes & $c_{ij}$ & - & \\
+ yes & - & - & \\
 \hline
+\end{tabular}
+}
+\caption{When no mask is present (and not complemented).
+\label{tab:maskaccum_nomask}}
+\end{center}
+\end{table}

-\verb'GrB_INP0'
-    & \verb'GxB_DEFAULT':
-    The first input is not transposed prior to using it in the operation.
-    & \verb'GrB_TRAN':
-    The first input is transposed prior to using it in the operation.  Only
-    matrices are transposed, never vectors. \\
+\begin{table}[h]
+\begin{center}
+{\small
+\begin{tabular}{lll|l}
+\hline
+ accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
+\hline
+ yes & $c_{ij}$ & $t_{ij}$ & $c_{ij} = t_{ij}$, apply \verb'GrB_SECOND' accumulator \\
+ yes & - & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
+ yes & $c_{ij}$ & - & \\
+ yes & - & - & \\
+\hline
+\end{tabular}
+}
+\caption{No mask, with the SECOND operator as the accumulator.
+\label{tab:maskaccum_nomask_2nd}}
+\end{center}
+\end{table}

+\begin{table}[h]
+\begin{center}
+{\small
+\begin{tabular}{lll|l}
 \hline
+ accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
+\hline
+ yes & $c_{ij}$ & $t_{ij}$ & \\
+ yes & - & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
+ yes & $c_{ij}$ & - & \\
+ yes & - & - & \\
+\hline
+\end{tabular}
+}
+\caption{No mask, with the FIRST operator as the accumulator.
+\label{tab:maskaccum_nomask_1st}}
+\end{center}
+\end{table}

-\verb'GrB_INP1'
-    & \verb'GxB_DEFAULT':
-    The second input is not transposed prior to using it in the operation.
-    & \verb'GrB_TRAN':
-    The second input is transposed prior to using it in the operation.  Only
-    matrices are transposed, never vectors. \\
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{SuiteSparse:GraphBLAS Options} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{options}

-\hline
+SuiteSparse:GraphBLAS includes two type-generic methods, \verb'GxB_set' and
+\verb'GxB_get', that set and query various options and parameter settings,
+including a generic way to set values in the \verb'GrB_Descriptor' object.
+Using these methods, the user application can provide hints to
+SuiteSparse:GraphBLAS on how it should store and operate on its matrices.
+These hints have no effect on the results of any GraphBLAS operation (except
+perhaps floating-point roundoff differences), but they can have a great impact
+on the amount of time or memory taken.

-\verb'GrB_AxB_METHOD'
-    & \verb'GxB_DEFAULT':
-    The method for \verb'C=A*B' is selected automatically.
-    & \verb'GxB_AxB_'{\em method}: The selected method is used to compute
-    \verb'C=A*B'. \\
+\begin{itemize}
+
+\item \verb'GxB_set (field, value)' sets global options.
+
+{\footnotesize
+\begin{tabular}{lll}
+field & value & description \\
+\hline
+\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
+\verb'GxB_BITMAP_SWITCH' & \verb'double [8]' & bitmap control \\
+\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
+ or \verb'GxB_BY_COL' \\
+\verb'GxB_GLOBAL_NTHREADS' & \verb'int' & number of threads to use \\
+\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\
+\verb'GxB_GLOBAL_CHUNK' & \verb'double' & chunk size \\
+\verb'GxB_CHUNK' & \verb'double' & chunk size \\
+\verb'GxB_BURBLE' & \verb'int' & diagnostic output \\
+\verb'GxB_PRINTF' & see below & diagnostic output \\
+\verb'GxB_FLUSH' & see below & diagnostic output \\
+\verb'GxB_MEMORY_POOL' & \verb'int64_t [64]' & memory pool control \\
+\verb'GxB_PRINT_1BASED' & \verb'int' & for printing matrices/vectors \\
+\hline
+\end{tabular}
+}

-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Desc\_set:} set a parameter in a descriptor}
-%-------------------------------------------------------------------------------
-\label{desc_set}
+\item \verb'GxB_set (GrB_Matrix A, field, value)' provides hints to
+    SuiteSparse:GraphBLAS on how to store a particular matrix.

-\begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Desc_set               // set a parameter in a descriptor
-(
-    GrB_Descriptor desc,            // descriptor to modify
-    GrB_Desc_Field field,           // parameter to change
-    ...                             // value to change it to
-) ;
-\end{verbatim} } \end{mdframed}
-
-\verb'GxB_Desc_set' is like \verb'GrB_Descriptor_set', except that the type of
-the third parameter can vary with the field.  This function can modify all
-descriptor settings, including those that do not have the type
-\verb'GrB_Desc_Value'.  See also \verb'GxB_set' described in
-Section~\ref{options}.  If an error occurs, \verb'GrB_error(&err,desc)' returns
-details about the error.
+\begin{tabular}{lll}
+field & value & description \\
+\hline
+\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
+\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
+\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
+ or \verb'GxB_BY_COL' \\
+\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
+\hline
+\end{tabular}
+}

-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Desc\_get:} get a parameter from a descriptor}
-%-------------------------------------------------------------------------------
-\label{desc_get}
+\item \verb'GxB_set (GrB_Vector v, field, value)' provides hints to
+    SuiteSparse:GraphBLAS on how to store a particular vector.

-\begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Desc_get               // get a parameter from a descriptor
-(
-    GrB_Descriptor desc,            // descriptor to query; NULL means defaults
-    GrB_Desc_Field field,           // parameter to query
-    ...                             // value of the parameter
-) ;
-\end{verbatim} } \end{mdframed}
-
-\verb'GxB_Desc_get' returns the value of a single field in a descriptor.  The
-type of the third parameter is a pointer to a variable type, whose type depends
-on the field.  See also \verb'GxB_get' described in Section~\ref{options}.
+\begin{tabular}{lll} +field & value & description \\ +\hline +\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\ +\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\ +\hline +\end{tabular} +} -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Descriptor\_free:} free a descriptor} -%------------------------------------------------------------------------------- -\label{descriptor_free} +\item \verb'GxB_set (GrB_Descriptor desc, field, value)' sets + the value of a field in a \verb'GrB_Descriptor'. -\begin{mdframed}[userdefinedwidth=6in] {\footnotesize -\begin{verbatim} -GrB_Info GrB_free // free a descriptor -( - GrB_Descriptor *descriptor // handle of descriptor to free -) ; -\end{verbatim} } \end{mdframed} - -\verb'GrB_Descriptor_free' frees a descriptor. -Either usage: - - {\small - \begin{verbatim} - GrB_Descriptor_free (&descriptor) ; - GrB_free (&descriptor) ; \end{verbatim}} +\begin{tabular}{lll} +field & value & description \\ +\hline +\verb'GrB_OUTP' & \verb'GrB_Desc_Value' & replace option \\ +\verb'GrB_MASK' & \verb'GrB_Desc_Value' & mask option \\ +\verb'GrB_INP0' & \verb'GrB_Desc_Value' & transpose input 0 \\ +\verb'GrB_INP1' & \verb'GrB_Desc_Value' & transpose input 1 \\ +\verb'GxB_DESCRIPTOR_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_DESCRIPTOR_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_AxB_METHOD' & \verb'int' & method for matrix multiply \\ +\verb'GxB_SORT' & \verb'int' & lazy vs aggressive sort \\ +\verb'GxB_COMPRESSION' & \verb'int' & compression for serialization \\ +\verb'GxB_IMPORT' & \verb'GrB_Desc_Value' & trust data on import/pack \\ +\hline +\end{tabular} +} -\noindent -frees the \verb'descriptor' and sets \verb'descriptor' to \verb'NULL'. It -safely does nothing if passed a \verb'NULL' handle, or if -\verb'descriptor == NULL' on input. +\end{itemize} -\newpage -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_DESC\_*:} built-in descriptors} -%------------------------------------------------------------------------------- -\label{descriptor_predefined} +\verb'GxB_get' queries a \verb'GrB_Descriptor', a \verb'GrB_Matrix', +a \verb'GrB_Vector', or the global options. -Built-in descriptors are listed in the table below. A dash in the table -indicates the default. These descriptors may not be modified or freed. -Attempts to modify them result in an error (\verb'GrB_INVALID_VALUE'); attempts -to free them are silently ignored. +\begin{itemize} -% \verb'GrB_NULL' is the default descriptor, with all settings at their defaults: -% \verb'OUTP': do not replace the output, -% \verb'MASK': mask is valued and not complemented, -% \verb'INP0': first input not transposed, and -% \verb'INP1': second input not transposed. -% For these pre-defined descriptors, the -% \verb'GxB_NTHREADS', -% \verb'GxB_CHUNK', and -% \verb'GxB_SORT' settings are at their default values. +\item \verb'GxB_get (field, &value)' retrieves the value of a global option. 
-\vspace{0.2in} -\noindent {\footnotesize -\begin{tabular}{|l|lllll|} -\hline -Descriptor & \verb'OUTP' & \verb'MASK' & \verb'MASK' & \verb'INP0' & \verb'INP1' \\ - & & structural & complement & & \\ -\hline -\verb'GrB_NULL' & - & - & - & - & - \\ -\verb'GrB_DESC_T1' & - & - & - & - & \verb'GrB_TRAN' \\ -\verb'GrB_DESC_T0' & - & - & - & \verb'GrB_TRAN' & - \\ -\verb'GrB_DESC_T0T1' & - & - & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\begin{tabular}{lll} +field & value & description \\ \hline -\verb'GrB_DESC_C' & - & - & \verb'GrB_COMP' & - & - \\ -\verb'GrB_DESC_CT1' & - & - & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ -\verb'GrB_DESC_CT0' & - & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ -\verb'GrB_DESC_CT0T1' & - & - & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\ +\verb'GxB_BITMAP_SWITCH' & \verb'double [8]' & bitmap control \\ +\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW' or \verb'GxB_BY_COL' \\ +\verb'GxB_GLOBAL_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_GLOBAL_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_BURBLE' & \verb'int' & diagnostic output \\ +\verb'GxB_PRINTF' & see below & diagnostic output \\ +\verb'GxB_FLUSH' & see below & diagnostic output \\ +\verb'GxB_MEMORY_POOL' & \verb'int64_t [64]' & memory pool control \\ +\verb'GxB_PRINT_1BASED' & \verb'int' & for printing matrices/vectors \\ +\verb'GxB_MODE' & \verb'int' & blocking/non-blocking \\ +\verb'GxB_LIBRARY_NAME' & \verb'char *' & name of library \\ +\verb'GxB_LIBRARY_VERSION' & \verb'int [3]' & library version \\ +\verb'GxB_LIBRARY_DATE' & \verb'char *' & release date \\ +\verb'GxB_LIBRARY_ABOUT' & \verb'char *' & about the library \\ +\verb'GxB_LIBRARY_LICENSE' & \verb'char *' & license \\ +\verb'GxB_LIBRARY_COMPILE_DATE' & \verb'char *' & date of compilation \\ +\verb'GxB_LIBRARY_COMPILE_TIME' & \verb'char *' & time of compilation \\ +\verb'GxB_LIBRARY_URL' & \verb'char *' & url of library \\ +\verb'GxB_API_VERSION' & \verb'int [3]' & C API version \\ +\verb'GxB_API_DATE' & \verb'char *' & C API date \\ +\verb'GxB_API_ABOUT' & \verb'char *' & about the C API \\ +\verb'GxB_API_URL' & \verb'char *' & \verb'http://graphblas.org' \\ +\verb'GxB_COMPILER_NAME' & \verb'char *' & C compiler name \\ +\verb'GxB_COMPILER_VERSION' & \verb'int [3]' & C compiler version \\ \hline -\verb'GrB_DESC_S' & - & \verb'GrB_STRUCTURE' & - & - & - \\ -\verb'GrB_DESC_ST1' & - & \verb'GrB_STRUCTURE' & - & - & \verb'GrB_TRAN' \\ -\verb'GrB_DESC_ST0' & - & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & - \\ -\verb'GrB_DESC_ST0T1' & - & \verb'GrB_STRUCTURE' & - & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\end{tabular} +} + +\item \verb'GxB_get (GrB_Matrix A, field, &value)' retrieves the current + value of an option from a particular matrix \verb'A'. 
+
+{\footnotesize
+\begin{tabular}{lll}
+field & value & description \\
 \hline
-\verb'GrB_DESC_SC'     & -                 & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & -              & -              \\
-\verb'GrB_DESC_SCT1'   & -                 & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & -              & \verb'GrB_TRAN' \\
-\verb'GrB_DESC_SCT0'   & -                 & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & -              \\
-\verb'GrB_DESC_SCT0T1' & -                 & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\
+\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
+\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
+\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
+ or \verb'GxB_BY_COL' \\
+\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
+\verb'GxB_SPARSITY_STATUS' & \verb'int' & 1, 2, 4, or 8 \\
 \hline
-\verb'GrB_DESC_R'      & \verb'GrB_REPLACE' & -                   & -               & -              & -              \\
-\verb'GrB_DESC_RT1'    & \verb'GrB_REPLACE' & -                   & -               & -              & \verb'GrB_TRAN' \\
-\verb'GrB_DESC_RT0'    & \verb'GrB_REPLACE' & -                   & -               & \verb'GrB_TRAN' & -              \\
-\verb'GrB_DESC_RT0T1'  & \verb'GrB_REPLACE' & -                   & -               & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\
+\end{tabular}
+}
+
+\item \verb'GxB_get (GrB_Vector v, field, &value)' retrieves the current
+    value of an option from a particular vector \verb'v'.
+
+{\footnotesize
+\begin{tabular}{lll}
+field & value & description \\
 \hline
-\verb'GrB_DESC_RC'     & \verb'GrB_REPLACE' & -                   & \verb'GrB_COMP' & -              & -              \\
-\verb'GrB_DESC_RCT1'   & \verb'GrB_REPLACE' & -                   & \verb'GrB_COMP' & -              & \verb'GrB_TRAN' \\
-\verb'GrB_DESC_RCT0'   & \verb'GrB_REPLACE' & -                   & \verb'GrB_COMP' & \verb'GrB_TRAN' & -              \\
-\verb'GrB_DESC_RCT0T1' & \verb'GrB_REPLACE' & -                   & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\
+\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
+\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
+ or \verb'GxB_BY_COL' \\
+\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
+\verb'GxB_SPARSITY_STATUS' & \verb'int' & 1, 2, 4, or 8 \\
 \hline
-\verb'GrB_DESC_RS'     & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & -              & -              & -              \\
-\verb'GrB_DESC_RST1'   & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & -              & -              & \verb'GrB_TRAN' \\
-\verb'GrB_DESC_RST0'   & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & -              & \verb'GrB_TRAN' & -              \\
-\verb'GrB_DESC_RST0T1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & -              & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\
+\end{tabular}
+}
+
+\item \verb'GxB_get (GrB_Descriptor desc, field, &value)' retrieves the value
+    of a field in a descriptor.
+ +{\footnotesize +\begin{tabular}{lll} +field & value & description \\ \hline -\verb'GrB_DESC_RSC' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & - \\ -\verb'GrB_DESC_RSCT1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & - & \verb'GrB_TRAN' \\ -\verb'GrB_DESC_RSCT0' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & - \\ -\verb'GrB_DESC_RSCT0T1' & \verb'GrB_REPLACE' & \verb'GrB_STRUCTURE' & \verb'GrB_COMP' & \verb'GrB_TRAN' & \verb'GrB_TRAN' \\ +\verb'GrB_OUTP' & \verb'GrB_Desc_Value' & replace option \\ +\verb'GrB_MASK' & \verb'GrB_Desc_Value' & mask option \\ +\verb'GrB_INP0' & \verb'GrB_Desc_Value' & transpose input 0 \\ +\verb'GrB_INP1' & \verb'GrB_Desc_Value' & transpose input 1 \\ +\verb'GxB_DESCRIPTOR_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\ +\verb'GxB_DESCRIPTOR_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_CHUNK' & \verb'double' & chunk size \\ +\verb'GxB_AxB_METHOD' & \verb'int' & method for matrix multiply \\ +\verb'GxB_SORT' & \verb'int' & lazy vs aggressive sort \\ +\verb'GxB_COMPRESSION' & \verb'int' & compression for serialization \\ +\verb'GxB_IMPORT' & \verb'GrB_Desc_Value' & trust data on import/pack \\ \hline -\end{tabular}} +\end{tabular} +} -\newpage -%=============================================================================== -\subsection{{\sf GrB\_free:} free any GraphBLAS object} %======================= -%=============================================================================== -\label{free} +\end{itemize} -Each of the ten objects has \verb'GrB_*_new' and \verb'GrB_*_free' methods -that are specific to each object. They can also be accessed by a generic -function, \verb'GrB_free', that works for all ten objects. If \verb'G' is any -of the ten objects, the statement +%------------------------------------------------------------------------------- +\subsection{OpenMP parallelism} +%------------------------------------------------------------------------------- +\label{omp_parallelism} - {\footnotesize - \begin{verbatim} - GrB_free (&G) ; \end{verbatim} } +SuiteSparse:GraphBLAS is a parallel library, based on OpenMP. By +default, all GraphBLAS operations will use up to the maximum number of threads +specified by the \verb'omp_get_max_threads' OpenMP function. For small +problems, GraphBLAS may choose to use fewer threads, using two parameters: the +maximum number of threads to use (which may differ from the +\verb'omp_get_max_threads' value), and a parameter called the \verb'chunk'. +Suppose \verb'work' is a measure of the work an operation needs to perform (say +the number of entries in the two input matrices for \verb'GrB_eWiseAdd'). No +more than \verb'floor(work/chunk)' threads will be used (or one thread if the +ratio is less than 1). -\noindent -frees the object and sets the variable \verb'G' to \verb'NULL'. It is safe to -pass in a \verb'NULL' handle, or to free an object twice: +The default \verb'chunk' value is 65,536, but this may change in future versions, +or it may be modified when GraphBLAS is installed on a particular machine. + +Both parameters can be set in two ways: + +\begin{itemize} + +\item Globally: If the following methods are used, then all subsequent +GraphBLAS operations will use these settings. Note the typecast, +\verb'(double)' \verb'chunk'. This is necessary if a literal constant such as +\verb'20000' is passed as this argument. The type of the constant must be +\verb'double'. 
 {\footnotesize
 \begin{verbatim}
-    GxB_set (desc, GxB_IMPORT, GxB_SECURE_IMPORT) ; \end{verbatim}}
+    int nthreads_max = 40 ;
+    GxB_set (GxB_NTHREADS, nthreads_max) ;
+    GxB_set (GxB_CHUNK, (double) 20000) ; \end{verbatim} }

-\noindent
-However, the following sequence of operations is not safe.  The first two are
-valid but the last statement will lead to undefined behavior.
+\item Per operation: Most GraphBLAS operations take a \verb'GrB_Descriptor'
+input, and this can be modified to set the number of threads and chunk
+size for the operation that uses this descriptor.  Note that \verb'chunk'
+is a \verb'double'.

 {\footnotesize
 \begin{verbatim}
-    H = G ;                // valid; creates a 2nd handle of the same object
-    GrB_free (&G) ;        // valid; G is freed and set to NULL; H now undefined
-    GrB_some_method (H) ;  // not valid; H is undefined \end{verbatim}}
+    GrB_Descriptor desc ;
+    GrB_Descriptor_new (&desc) ;
+    int nthreads_max = 40 ;
+    GxB_set (desc, GxB_NTHREADS, nthreads_max) ;
+    double chunk = 20000 ;
+    GxB_set (desc, GxB_CHUNK, chunk) ; \end{verbatim} }

-Some objects are predefined, such as the built-in types.  If a user application
-attempts to free a built-in object, SuiteSparse:GraphBLAS will safely do
-nothing.  The \verb'GrB_free' function in SuiteSparse:GraphBLAS always
-returns \verb'GrB_SUCCESS'.
+\end{itemize}

-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{The mask, accumulator, and replace option} %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{sec:maskaccum}
+The smaller of \verb'nthreads_max' and \verb'floor(work/chunk)' is used for any
+given GraphBLAS operation, except that a single thread is used if this value is
+zero or less.

-After a GraphBLAS operation computes a result ${\bf T}$, (for example, ${\bf
-T=AB}$ for \verb'GrB_mxm'), the results are assigned to an output matrix ${\bf
-C}$ via the mask/ accumulator phase, written as ${\bf C \langle M \rangle = C
-\odot T}$.  This phase is affected by the \verb'GrB_REPLACE' option in the
-descriptor, the presence of an optional binary accumulator operator ($\odot$),
-the presence of the optional mask matrix ${\bf M}$, and the status of the mask
-descriptor.  The interplay of these options is summarized in
-Table~\ref{tab:maskaccum}.
+If either parameter is set to \verb'GxB_DEFAULT', then default values are used.
+The default for \verb'nthreads_max' is the return value from
+\verb'omp_get_max_threads', and the default chunk size is currently 65,536.

-The mask ${\bf M}$ may be present, or not.  It may be structural or valued, and
-it may be complemented, or not.  These options may be combined, for a total of
-8 cases, although the structural/valued option as no effect if ${\bf M}$ is not
-present.  If ${\bf M}$ is not present and not complemented, then $m_{ij}$ is
-implicitly true.  If not present yet complemented, then all $m_{ij}$ entries are
-implicitly zero; in this case, ${\bf T}$ need not be computed at all.  Either
-${\bf C}$ is not modified, or all its entries are cleared if the replace option
-is enabled.  If ${\bf M}$ is present, and the structural option is used, then
-$m_{ij}$ is treated as true if it is an entry in the matrix (its value is
-ignored).  Otherwise, the value of $m_{ij}$ is used.  In both cases, entries
-not present are implicitly zero. 
These values are negated if the mask is -complemented. All of these various cases are combined to give a single -effective value of the mask at position ${ij}$. +If a descriptor value for either parameter is left at its default, or set to +\verb'GxB_DEFAULT', then the global setting is used. This global setting may +have been modified from its default, and this modified value will be used. -The combination of all these options are presented in the -Table~\ref{tab:maskaccum}. The first column is the \verb'GrB_REPLACE' option. -The second column lists whether or not the accumulator operator is present. -The third column lists whether or not $c_{ij}$ exists on input to the -mask/accumulator phase (a dash means that it does not exist). The fourth -column lists whether or not the entry $t_{ij}$ is present in the result matrix -${\bf T}$. The mask column is the final effective value of $m_{ij}$, after -accounting for the presence of ${\bf M}$ and the mask options. Finally, the -last column states the result of the mask/accum step; if no action is listed in -this column, then $c_{ij}$ is not modified. +For example, suppose \verb'omp_get_max_threads' reports 8 threads. If \newline +\verb'GxB_set (GxB_NTHREADS, 4)' is used, then the global setting is four +threads, not eight. If a descriptor is used but its \verb'GxB_NTHREADS' is not +set, or set to \verb'GxB_DEFAULT', then any operation that uses this descriptor +will use 4 threads. -Several important observations can be made from this table. First, -if no mask is present (and the mask-complement descriptor option is not used), -then only the first half of the table is used. In this case, the \verb'GrB_REPLACE' -option has no effect. The entire matrix ${\bf C}$ is modified. +%------------------------------------------------------------------------------- +\subsection{Storing a matrix by row or by column} +%------------------------------------------------------------------------------- -Consider the cases when $c_{ij}$ is present but $t_{ij}$ is not, and there is no -mask or the effective value of the mask is true for this ${ij}$ position. With -no accumulator operator, $c_{ij}$ is deleted. If the accumulator operator is -present and the replace option is not used, $c_{ij}$ remains unchanged. +The GraphBLAS \verb'GrB_Matrix' is entirely opaque to the user application, and +the GraphBLAS API does not specify how the matrix should be stored. However, +choices made in how the matrix is represented in a particular implementation, +such as SuiteSparse:GraphBLAS, can have a large impact on performance. 
-\begin{table} -{\small -\begin{tabular}{lllll|l} -\hline -repl & accum & ${\bf C}$ & ${\bf T}$ & mask & action taken by ${\bf C \langle M \rangle = C \odot T}$ \\ -\hline - - &- & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, update \\ - - &- & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ - - &- & $c_{ij}$ & - & 1 & delete $c_{ij}$ because $t_{ij}$ not present \\ - - &- & - & - & 1 & \\ - - &- & $c_{ij}$ & $t_{ij}$ & 0 & \\ - - &- & - & $t_{ij}$ & 0 & \\ - - &- & $c_{ij}$ & - & 0 & \\ - - &- & - & - & 0 & \\ -\hline - yes&- & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, update \\ - yes&- & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ - yes&- & $c_{ij}$ & - & 1 & delete $c_{ij}$ because $t_{ij}$ not present \\ - yes&- & - & - & 1 & \\ - yes&- & $c_{ij}$ & $t_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ - yes&- & - & $t_{ij}$ & 0 & \\ - yes&- & $c_{ij}$ & - & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ - yes&- & - & - & 0 & \\ -\hline - - &yes & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\ - - &yes & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ - - &yes & $c_{ij}$ & - & 1 & \\ - - &yes & - & - & 1 & \\ - - &yes & $c_{ij}$ & $t_{ij}$ & 0 & \\ - - &yes & - & $t_{ij}$ & 0 & \\ - - &yes & $c_{ij}$ & - & 0 & \\ - - &yes & - & - & 0 & \\ -\hline - yes&yes & $c_{ij}$ & $t_{ij}$ & 1 & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\ - yes&yes & - & $t_{ij}$ & 1 & $c_{ij} = t_{ij}$, insert \\ - yes&yes & $c_{ij}$ & - & 1 & \\ - yes&yes & - & - & 1 & \\ - yes&yes & $c_{ij}$ & $t_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ - yes&yes & - & $t_{ij}$ & 0 & \\ - yes&yes & $c_{ij}$ & - & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\ - yes&yes & - & - & 0 & \\ -\hline -\end{tabular} -} -\caption{Results of the mask/accumulator phase. \label{tab:maskaccum}} -\end{table} +Many graph algorithms are just as fast in any format, but some algorithms are +much faster in one format or the other. For example, suppose the user +application stores a directed graph as a matrix \verb'A', with the edge $(i,j)$ +represented as the value \verb'A(i,j)', and the application makes many accesses +to the $i$th row of the matrix, with \verb'GrB_Col_extract' +\verb'(w,...,A,GrB_ALL,...,i,desc)' with the transposed descriptor +(\verb'GrB_INP0' set to \verb'GrB_TRAN'). If the matrix is stored by column +this can be extremely slow, just like the expression \verb'w=A(i,:)' in MATLAB, +where \verb'i' is a scalar. Since this is a typical use-case in graph +algorithms, the default format in SuiteSparse:GraphBLAS is to store its +matrices by row, in Compressed Sparse Row format (CSR). -When there is no mask and the mask \verb'GrB_COMP' option is not selected, the -table simplifies (Table~\ref{tab:maskaccum_nomask}). The \verb'GrB_REPLACE' -option no longer has any effect. The \verb'GrB_SECOND_T' binary operator when -used as the accumulator unifies the first cases, shown in -Table~\ref{tab:maskaccum_nomask_2nd}. The only difference now is the behavior -when $c_{ij}$ is present but $t_{ij}$ is not. Finally, the effect of -\verb'GrB_FIRST_T' as the accumulator is shown in -Table~\ref{tab:maskaccum_nomask_1st}. +MATLAB stores its sparse matrices by column, in ``non-hypersparse'' format, in +what is called the Compressed Sparse Column format, or CSC for short. An +\verb'm'-by-\verb'n' matrix in MATLAB is represented as a set of \verb'n' +column vectors, each with a sorted list of row indices and values of the +nonzero entries in that column. 
As a result, \verb'w=A(:,j)' is very fast in
+MATLAB, since the result is already held in the data structure as a single
+list, the $j$th column vector.  However, \verb'w=A(i,:)' is very slow in
+MATLAB, since every column in the matrix has to be searched to see if it
+contains row \verb'i'.  In MATLAB, if many such accesses are made, it is much
+better to transpose the matrix (say \verb"AT=A'") and then use \verb"w=AT(:,i)"
+instead.  This can have a dramatic impact on the performance of MATLAB.

-\begin{table}[h]
-\begin{center}
-{\small
-\begin{tabular}{lll|l}
-\hline
- accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
-\hline
- -   & $c_{ij}$ & $t_{ij}$ & $c_{ij} = t_{ij}$, update \\
- -   & -        & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
- -   & $c_{ij}$ & -        & delete $c_{ij}$ because $t_{ij}$ not present \\
- -   & -        & -        & \\
-\hline
- yes & $c_{ij}$ & $t_{ij}$ & $c_{ij} = c_{ij} \odot t_{ij}$, apply accumulator \\
- yes & -        & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
- yes & $c_{ij}$ & -        & \\
- yes & -        & -        & \\
-\hline
-\end{tabular}
-}
-\caption{When no mask is present (and not complemented).
-\label{tab:maskaccum_nomask}}
-\end{center}
-\end{table}
+Likewise, if \verb'u' is a very sparse column vector and \verb'A' is stored by
+column, then \verb"w=u'*A" (via \verb'GrB_vxm') is slower than \verb'w=A*u'
+(via \verb'GrB_mxv').  The opposite is true if the matrix is stored by row.
+
+SuiteSparse:GraphBLAS stores its matrices by row, by default (with one
+exception described below).  However, it can also be instructed to store any
+selected matrices, or all matrices, by column instead (just like MATLAB), so
+that \verb'w=A(:,j)' (via \verb'GrB_Col_extract') is very fast.  The change in
+data format has no effect on the result, just the time and memory usage.  To
+use a column-oriented format by default, the following can be done in a user
+application that tends to access its matrices by column.
+
+ {\footnotesize
+ \begin{verbatim}
+    GrB_init (...) ;
+    // just after GrB_init: do the following:
+    #ifdef GxB_SUITESPARSE_GRAPHBLAS
+    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
+    #endif \end{verbatim} }
+
+If this is done, and no other \verb'GxB_set' calls are made with
+\verb'GxB_FORMAT', all matrices will be stored by column.
+The default format is \verb'GxB_BY_ROW'.

-\begin{table}[h]
-\begin{center}
-{\small
-\begin{tabular}{lll|l}
-\hline
- accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
-\hline
- yes & $c_{ij}$ & $t_{ij}$ & $c_{ij} = t_{ij}$, apply \verb'GrB_SECOND' accumulator \\
- yes & -        & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
- yes & $c_{ij}$ & -        & \\
- yes & -        & -        & \\
-\hline
-\end{tabular}
-}
-\caption{No mask, with the SECOND operator as the accumulator.
-\label{tab:maskaccum_nomask_2nd}}
-\end{center}
-\end{table}
+All vectors (\verb'GrB_Vector') are held by column, and this cannot be changed.

-\begin{table}[h]
-\begin{center}
-{\small
-\begin{tabular}{lll|l}
-\hline
- accum & ${\bf C}$ & ${\bf T}$ & action taken by ${\bf C = C \odot T}$ \\
-\hline
- yes & $c_{ij}$ & $t_{ij}$ & \\
- yes & -        & $t_{ij}$ & $c_{ij} = t_{ij}$, insert \\
- yes & $c_{ij}$ & -        & \\
- yes & -        & -        & \\
-\hline
-\end{tabular}
-}
-\caption{No Mask, with the FIRST operator as the accumulator.
-\label{tab:maskaccum_nomask_1st}}
-\end{center}
-\end{table}
+By default, matrices of size \verb'm-by-1' are held by column, regardless of
+the global setting described above.  Matrices of size \verb'1-by-n' with
+\verb'n' not equal to 1 are held by row, regardless of the global setting.
+The global setting only affects matrices with both \verb'm > 1' and \verb'n > 1'.
+Empty matrices (\verb'0-by-0') are also controlled by the global setting.

-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{SuiteSparse:GraphBLAS Options} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{options}
+After creating a matrix with \verb'GrB_Matrix_new (&A, ...)',
+its format can be changed arbitrarily with \verb'GxB_set (A, GxB_FORMAT, ...)'.
+So even an \verb'm-by-1' matrix can then be changed to be held by row, for
+example.  Likewise, once a \verb'1-by-n' matrix is created, it can be converted
+to column-oriented format.

-SuiteSparse:GraphBLAS includes two type-generic methods, \verb'GxB_set' and
-\verb'GxB_get', that set and query various options and parameters settings,
-including a generic way to set values in the \verb'GrB_Descriptor' object.
-Using these methods, the user application can provide hints to
-SuiteSparse:GraphBLAS on how it should store and operate on its matrices.
-These hints have no effect on the results of any GraphBLAS operation (except
-perhaps floating-point roundoff differences), but they can have a great impact
-on the amount of time or memory taken.
+%-------------------------------------------------------------------------------
+\subsection{Hypersparse matrices}
+\label{hypersparse}
+%-------------------------------------------------------------------------------

-\begin{itemize}
+MATLAB can store an \verb'm'-by-\verb'n' matrix with a very large value of
+\verb'm', since a CSC data structure takes $O(n+|{\bf A}|)$ memory, independent
+of \verb'm', where $|{\bf A}|$ is the number of nonzeros in the matrix.  It
+cannot store a matrix with a huge \verb'n', and this structure is also
+inefficient when $|{\bf A}|$ is much smaller than \verb'n'.  In contrast,
+SuiteSparse:GraphBLAS can store its matrices in {\em hypersparse} format,
+taking only $O(|{\bf A}|)$ memory, independent of how it is stored (by row or
+by column) and independent of both \verb'm' and \verb'n'
+\cite{BulucGilbert08,BulucGilbert12}.

-\item \verb'GxB_set (field, value)' sets global options.
+In both the CSR and CSC formats, the matrix is held as a set of sparse vectors.
+In non-hypersparse format, the set of sparse vectors is itself dense; all
+vectors are present, even if they are empty.  For example, an
+\verb'm'-by-\verb'n' matrix in non-hypersparse CSC format contains \verb'n'
+sparse vectors.  Each column vector takes at least one integer to represent,
+even for a column with no entries.  This allows for quick lookup for a
+particular vector, but the memory required is $O(n+|{\bf A}|)$.  With a
+hypersparse CSC format, the set of vectors itself is sparse, and columns with
+no entries take no memory at all.  The drawback of the hypersparse format is
+that finding an arbitrary column vector \verb'j', such as for the computation
+\verb'C=A(:,j)', takes $O(\log k)$ time if there are $k \le n$ vectors in the
+data structure.  One advantage of the hypersparse structure is that the memory
+required for an \verb'm'-by-\verb'n' hypersparse CSC matrix is only
+$O(|{\bf A}|)$, independent of \verb'm' and \verb'n'.  Algorithms that must
+visit all non-empty columns of a matrix are much faster when working with
+hypersparse matrices, since empty columns can be skipped.
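+
+For example, the following sketch creates a matrix of enormous dimension;
+this is practical only because the matrix is hypersparse (the dimension chosen
+here is arbitrary):
+
+ {\footnotesize
+ \begin{verbatim}
+    GrB_Matrix A ;
+    GrB_Index n = (GrB_Index) 1 << 50 ;      // n = 2^50
+    GrB_Matrix_new (&A, GrB_FP64, n, n) ;    // memory is O(|A|), not O(n)
+ \end{verbatim} }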
-{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
-\verb'GxB_BITMAP_SWITCH' & \verb'double [8]' & bitmap control \\
-\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
- or \verb'GxB_BY_COL' \\
-\verb'GxB_GLOBAL_NTHREADS' & \verb'int' & number of threads to use \\
-\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\
-\verb'GxB_GLOBAL_CHUNK' & \verb'double' & chunk size \\
-\verb'GxB_CHUNK' & \verb'double' & chunk size \\
-\verb'GxB_BURBLE' & \verb'int' & diagnostic output \\
-\verb'GxB_PRINTF' & see below & diagnostic output \\
-\verb'GxB_FLUSH' & see below & diagnostic output \\
-\verb'GxB_MEMORY_POOL' & \verb'int64_t [64]' & memory pool control \\
-\verb'GxB_PRINT_1BASED' & \verb'int' & for printing matrices/vectors \\
-\hline
-\end{tabular}
-}
+The \verb'hyper_switch' parameter controls the hypersparsity of the internal
+data structure for a matrix.  The parameter is typically in the range 0 to 1.
+The default is \verb'hyper_switch' = \verb'GxB_HYPER_DEFAULT', which is an
+\verb'extern' \verb'const' \verb'double' value, currently set to 0.0625, or
+1/16.  This default ratio may change in the future.

-\item \verb'GxB_set (GrB_Matrix A, field, value)' provides hints to
-    SuiteSparse: GraphBLAS on how to store a particular matrix.
+The \verb'hyper_switch' determines how the matrix is converted between the
+hypersparse and non-hypersparse formats.  Let $n$ be the number of columns of a
+CSC matrix, or the number of rows of a CSR matrix.  The matrix can have at most
+$n$ non-empty vectors.

-{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
-\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
-\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
- or \verb'GxB_BY_COL' \\
-\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
-\hline
-\end{tabular}
-}
+Let $k$ be the actual number of non-empty vectors.  That is, for the CSC
+format, $k \le n$ is the number of columns that have at least one entry.  Let
+$h$ be the value of \verb'hyper_switch'.

-\item \verb'GxB_set (GrB_Vector v, field, value)' provides hints to
-    SuiteSparse: GraphBLAS on how to store a particular vector.
+If a matrix is currently hypersparse, it can be converted to non-hypersparse if
+either condition $n \le 1$ or $k > 2nh$ holds, or both.  Otherwise, it
+stays hypersparse.  Note that if $n \le 1$ the matrix is always stored as
+non-hypersparse.

-{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
-\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
-\hline
-\end{tabular}
-}
+If currently non-hypersparse, it can be converted to hypersparse if
+both conditions $n > 1$ and $k \le nh$ hold.  Otherwise, it stays
+non-hypersparse.  Note that if $n \le 1$ the matrix always remains
+non-hypersparse.

-\item \verb'GxB_set (GrB_Descriptor desc, field, value)' sets
-    the value of a field in a \verb'GrB_Descriptor'.
+The default value of \verb'hyper_switch' is assigned at startup by
+\verb'GrB_init', and can then be modified globally with \verb'GxB_set'.  All
+new matrices are created with the same \verb'hyper_switch', determined by the
+global value. 
Once a particular matrix \verb'A' has been constructed, its +hypersparsity ratio can be modified from the default with: -{\footnotesize -\begin{tabular}{lll} -field & value & description \\ -\hline -\verb'GrB_OUTP' & \verb'GrB_Desc_Value' & replace option \\ -\verb'GrB_MASK' & \verb'GrB_Desc_Value' & mask option \\ -\verb'GrB_INP0' & \verb'GrB_Desc_Value' & transpose input 0 \\ -\verb'GrB_INP1' & \verb'GrB_Desc_Value' & transpose input 1 \\ -\verb'GxB_DESCRIPTOR_NTHREADS' & \verb'int' & number of threads to use \\ -\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\ -\verb'GxB_DESCRIPTOR_CHUNK' & \verb'double' & chunk size \\ -\verb'GxB_CHUNK' & \verb'double' & chunk size \\ -\verb'GxB_AxB_METHOD' & \verb'int' & method for matrix multiply \\ -\verb'GxB_SORT' & \verb'int' & lazy vs aggressive sort \\ -\verb'GxB_COMPRESSION' & \verb'int' & compression for serialization \\ -\verb'GxB_IMPORT' & \verb'GrB_Desc_Value' & trust data on import/pack \\ -\hline -\end{tabular} -} + {\footnotesize + \begin{verbatim} + double hyper_switch = 0.2 ; + GxB_set (A, GxB_HYPER_SWITCH, hyper_switch) ; \end{verbatim}} -\end{itemize} +To force a matrix to always be non-hypersparse, use \verb'hyper_switch' equal to +\verb'GxB_NEVER_HYPER'. To force a matrix to always stay hypersparse, set +\verb'hyper_switch' to \verb'GxB_ALWAYS_HYPER'. -\verb'GxB_get' queries a \verb'GrB_Descriptor', a \verb'GrB_Matrix', -a \verb'GrB_Vector', or the global options. +A \verb'GrB_Matrix' can thus be held in one of four formats: any combination of +hyper/non-hyper and CSR/CSC. All \verb'GrB_Vector' objects are always stored +in non-hypersparse CSC format. -\begin{itemize} +A new matrix created via \verb'GrB_Matrix_new' starts with $k=0$ and is created +in hypersparse form by default unless $n \le 1$ or if $h<0$, where $h$ is the +global \verb'hyper_switch' value. The matrix is created in either +\verb'GxB_BY_ROW' or \verb'GxB_BY_COL' format, as determined by the last call +to \verb'GxB_set(GxB_FORMAT,...)' or \verb'GrB_init'. -\item \verb'GxB_get (field, &value)' retrieves the value of a global option. +A new matrix \verb'C' created via \verb'GrB_dup (&C,A)' inherits the CSR/CSC +format, hypersparsity format, and \verb'hyper_switch' from \verb'A'. 
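+
+Both controls can be combined on a single matrix.  A minimal sketch (the
+particular settings chosen here are arbitrary):
+
+ {\footnotesize
+ \begin{verbatim}
+    GxB_set (A, GxB_HYPER_SWITCH, GxB_ALWAYS_HYPER) ;  // always stay hypersparse
+    GxB_set (A, GxB_FORMAT, GxB_BY_COL) ;              // store by column (CSC)
+ \end{verbatim} }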
-{\footnotesize -\begin{tabular}{lll} -field & value & description \\ -\hline -\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\ -\verb'GxB_BITMAP_SWITCH' & \verb'double [8]' & bitmap control \\ -\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW' or \verb'GxB_BY_COL' \\ -\verb'GxB_GLOBAL_NTHREADS' & \verb'int' & number of threads to use \\ -\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\ -\verb'GxB_GLOBAL_CHUNK' & \verb'double' & chunk size \\ -\verb'GxB_CHUNK' & \verb'double' & chunk size \\ -\verb'GxB_BURBLE' & \verb'int' & diagnostic output \\ -\verb'GxB_PRINTF' & see below & diagnostic output \\ -\verb'GxB_FLUSH' & see below & diagnostic output \\ -\verb'GxB_MEMORY_POOL' & \verb'int64_t [64]' & memory pool control \\ -\verb'GxB_PRINT_1BASED' & \verb'int' & for printing matrices/vectors \\ -\verb'GxB_MODE' & \verb'int' & blocking/non-blocking \\ -\verb'GxB_LIBRARY_NAME' & \verb'char *' & name of library \\ -\verb'GxB_LIBRARY_VERSION' & \verb'int [3]' & library version \\ -\verb'GxB_LIBRARY_DATE' & \verb'char *' & release date \\ -\verb'GxB_LIBRARY_ABOUT' & \verb'char *' & about the library \\ -\verb'GxB_LIBRARY_LICENSE' & \verb'char *' & license \\ -\verb'GxB_LIBRARY_COMPILE_DATE' & \verb'char *' & date of compilation \\ -\verb'GxB_LIBRARY_COMPILE_TIME' & \verb'char *' & time of compilation \\ -\verb'GxB_LIBRARY_URL' & \verb'char *' & url of library \\ -\verb'GxB_API_VERSION' & \verb'int [3]' & C API version \\ -\verb'GxB_API_DATE' & \verb'char *' & C API date \\ -\verb'GxB_API_ABOUT' & \verb'char *' & about the C API \\ -\verb'GxB_API_URL' & \verb'char *' & \verb'http://graphblas.org' \\ -\verb'GxB_COMPILER_NAME' & \verb'char *' & C compiler name \\ -\verb'GxB_COMPILER_VERSION' & \verb'int [3]' & C compiler version \\ +%------------------------------------------------------------------------------- +\subsection{Bitmap matrices} +\label{bitmap_switch} +%------------------------------------------------------------------------------- + +By default, SuiteSparse:GraphBLAS switches between all four formats +(hypersparse, sparse, bitmap, and full) automatically. Let $d = |{\bf A}|/mn$ +for an $m$-by-$n$ matrix $\bf A$ with $|{\bf A}|$ entries. If the matrix is +currently in sparse or hypersparse format, and is modified so that $d$ exceeds +a given threshold, it is converted into bitmap format. The default threshold +is controlled by the \verb'GxB_BITMAP_SWITCH' setting, which can be set +globally, or for a particular matrix or vector. + +The default value of the switch to bitmap format depends on $\min(m,n)$, for a +matrix of size $m$-by-$n$. For the global setting, the bitmap switch is a +\verb'double' array of size \verb'GxB_NBITMAP_SWITCH'. The defaults are given +below: + +\vspace{0.2in} +{\small +\begin{tabular}{lll} +parameter & default & matrix sizes \\ \hline +\verb'bitmap_switch [0]' & 0.04 & $\min(m,n) = 1$ (and all vectors) \\ +\verb'bitmap_switch [1]' & 0.05 & $\min(m,n) = 2$ \\ +\verb'bitmap_switch [2]' & 0.06 & $\min(m,n) = 3$ to 4 \\ +\verb'bitmap_switch [3]' & 0.08 & $\min(m,n) = 5$ to 8 \\ +\verb'bitmap_switch [4]' & 0.10 & $\min(m,n) = 9$ to 16\\ +\verb'bitmap_switch [5]' & 0.20 & $\min(m,n) = 17$ to 32\\ +\verb'bitmap_switch [6]' & 0.30 & $\min(m,n) = 33$ to 64 \\ +\verb'bitmap_switch [7]' & 0.40 & $\min(m,n) > 64$ \\ \end{tabular} } +\vspace{0.2in} -\item \verb'GxB_get (GrB_Matrix A, field, &value)' retrieves the current - value of an option from a particular matrix \verb'A'. 
+That is, by default a \verb'GrB_Vector' is held in bitmap format if its density
+exceeds 4\%.  To change the global settings, do the following:

{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GxB_HYPER_SWITCH' & \verb'double' & hypersparsity control (0 to 1) \\
-\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
-\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
-   or \verb'GxB_BY_COL' \\
-\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
-\verb'GxB_SPARSITY_STATUS' & \verb'int' & 1, 2, 4, or 8 \\
-\hline
-\end{tabular}
+\begin{verbatim}
+    double bswitch [GxB_NBITMAP_SWITCH] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 } ;
+    GxB_set (GxB_BITMAP_SWITCH, bswitch) ;
+\end{verbatim}
}

-\item \verb'GxB_get (GrB_Vector A, field, &value)' retrieves the current
-   value of an option from a particular vector \verb'v'.
+If the matrix is currently in bitmap format, it is converted to full if all
+entries are present, or to sparse/hypersparse if $d$ drops below $b/2$, where
+$b$ is its bitmap switch.  A matrix or vector with $d$ between $b/2$ and $b$
+remains in its current format.
+
+%-------------------------------------------------------------------------------
+\subsection{Parameter types}
+%-------------------------------------------------------------------------------
+The \verb'GxB_Option_Field' enumerated type gives the type of the \verb'field'
+parameter for the second argument of \verb'GxB_set' and \verb'GxB_get',
+for setting global options or matrix options.

{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GxB_BITMAP_SWITCH' & \verb'double' & bitmap control (0 to 1) \\
-\verb'GxB_FORMAT' & \verb'int' & \verb'GxB_BY_ROW'
-   or \verb'GxB_BY_COL' \\
-\verb'GxB_SPARSITY_CONTROL' & \verb'int' & 0 to 15 \\
-\verb'GxB_SPARSITY_STATUS' & \verb'int' & 1, 2, 4, or 8 \\
-\hline
-\end{tabular}
+\begin{verbatim}
+typedef enum
+{
+    // for matrix/vector get/set and global get/set:
+    GxB_HYPER_SWITCH = 0,   // defines switch to hypersparse (double value)
+    GxB_BITMAP_SWITCH = 34, // defines switch to bitmap (double value)
+    GxB_FORMAT = 1,         // defines CSR/CSC format: GxB_BY_ROW or GxB_BY_COL
+    GxB_SPARSITY_CONTROL = 32,  // control the sparsity of a matrix or vector
+
+    // for global get/set only:
+    GxB_GLOBAL_NTHREADS = GxB_NTHREADS,  // max number of threads to use
+    GxB_GLOBAL_CHUNK = GxB_CHUNK,        // chunk size for small problems
+    GxB_BURBLE = 99,        // diagnostic output
+    GxB_PRINTF = 101,       // printf function for diagnostic output
+    GxB_FLUSH = 102,        // flush function for diagnostic output
+    GxB_MEMORY_POOL = 103,  // memory pool control
+    GxB_PRINT_1BASED = 104, // print matrices as 0-based or 1-based
+
+    // for matrix/vector get only:
+    GxB_SPARSITY_STATUS = 33,   // query the sparsity of a matrix or vector
+
+    // for global get only:
+    GxB_MODE = 2,   // mode passed to GrB_init (blocking or non-blocking)
+    GxB_LIBRARY_NAME = 8,   // name of the library (char *)
+    GxB_LIBRARY_VERSION = 9,    // library version (3 int's)
+    GxB_LIBRARY_DATE = 10,  // date of the library (char *)
+    GxB_LIBRARY_ABOUT = 11, // about the library (char *)
+    GxB_LIBRARY_URL = 12,   // URL for the library (char *)
+    GxB_LIBRARY_LICENSE = 13,   // license of the library (char *)
+    GxB_LIBRARY_COMPILE_DATE = 14,  // date library was compiled (char *)
+    GxB_LIBRARY_COMPILE_TIME = 15,  // time library was compiled (char *)
+    GxB_API_VERSION = 16,   // API version (3 int's)
+    GxB_API_DATE = 17,  // date of the API (char *)
+    GxB_API_ABOUT = 18, // about the API (char *)
+    GxB_API_URL = 19,   // URL for the API (char *)
+}
+GxB_Option_Field ;
+\end{verbatim} }

-\item \verb'GxB_get (GrB_Descriptor desc, field, &value)' retrieves the value
-   of a field in a descriptor.
+The \verb'GxB_FORMAT' field selects storage by row or by column, set with a
+value of type \verb'GxB_Format_Value':

{\footnotesize
-\begin{tabular}{lll}
-field & value & description \\
-\hline
-\verb'GrB_OUTP' & \verb'GrB_Desc_Value' & replace option \\
-\verb'GrB_MASK' & \verb'GrB_Desc_Value' & mask option \\
-\verb'GrB_INP0' & \verb'GrB_Desc_Value' & transpose input 0 \\
-\verb'GrB_INP1' & \verb'GrB_Desc_Value' & transpose input 1 \\
-\verb'GxB_DESCRIPTOR_NTHREADS' & \verb'int' & number of threads to use \\
-\verb'GxB_NTHREADS' & \verb'int' & number of threads to use \\
-\verb'GxB_DESCRIPTOR_CHUNK' & \verb'double' & chunk size \\
-\verb'GxB_CHUNK' & \verb'double' & chunk size \\
-\verb'GxB_AxB_METHOD' & \verb'int' & method for matrix multiply \\
-\verb'GxB_SORT' & \verb'int' & lazy vs aggressive sort \\
-\verb'GxB_COMPRESSION' & \verb'int' & compression for serialization \\
-\verb'GxB_IMPORT' & \verb'GrB_Desc_Value' & trust data on import/pack \\
-\hline
-\end{tabular}
+\begin{verbatim}
+typedef enum
+{
+    GxB_BY_ROW = 0,     // CSR: compressed sparse row format
+    GxB_BY_COL = 1      // CSC: compressed sparse column format
+}
+GxB_Format_Value ;
+\end{verbatim} }

-\end{itemize}
+The default format is given by the predefined value \verb'GxB_FORMAT_DEFAULT',
+which is equal to \verb'GxB_BY_ROW'.
+The default hypersparsity
+ratio is 0.0625 (1/16), but this value may change in the future.

-%-------------------------------------------------------------------------------
-\subsection{OpenMP parallelism}
-%-------------------------------------------------------------------------------
-\label{omp_parallelism}
+Setting the \verb'GxB_HYPER_SWITCH' field to \verb'GxB_ALWAYS_HYPER' ensures a matrix
+always stays hypersparse.  If set to \verb'GxB_NEVER_HYPER', it always stays
+non-hypersparse.  At startup, \verb'GrB_init' defines the following initial
+settings:

-SuiteSparse:GraphBLAS is a parallel library, based on OpenMP.  By
-default, all GraphBLAS operations will use up to the maximum number of threads
-specified by the \verb'omp_get_max_threads' OpenMP function.  For small
-problems, GraphBLAS may choose to use fewer threads, using two parameters: the
-maximum number of threads to use (which may differ from the
-\verb'omp_get_max_threads' value), and a parameter called the \verb'chunk'.
-Suppose \verb'work' is a measure of the work an operation needs to perform (say
-the number of entries in the two input matrices for \verb'GrB_eWiseAdd').  No
-more than \verb'floor(work/chunk)' threads will be used (or one thread if the
-ratio is less than 1).
+{\footnotesize
+\begin{verbatim}
+    GxB_set (GxB_HYPER_SWITCH, GxB_HYPER_DEFAULT) ;
+    GxB_set (GxB_FORMAT, GxB_BY_ROW) ;
+\end{verbatim} }

-The default \verb'chunk' value is 65,536, but this may change in future versions,
-or it may be modified when GraphBLAS is installed on a particular machine.
+That is, by default, all new matrices are held by row in CSR format (except
+for \verb'n-by-1' matrices; see \verb'GrB_Matrix_new').
+If a matrix has fewer than $n/16$
+non-empty columns, it can be converted to hypersparse format.  If it has more
+than $n/8$ non-empty columns, it can be converted to non-hypersparse format.
+These options can be changed for all future matrices with \verb'GxB_set'.
For example, to change
+all future matrices to be in non-hypersparse CSC when created, use:

-Both parameters can be set in two ways:
+{\footnotesize
+\begin{verbatim}
+    GxB_set (GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
+    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
+\end{verbatim} }

-\begin{itemize}
+If a particular matrix then needs a different format, it can be changed, as in
+this example:

-\item Globally: If the following methods are used, then all subsequent
-GraphBLAS operations will use these settings.  Note the typecast,
-\verb'(double)' \verb'chunk'.  This is necessary if a literal constant such as
-\verb'20000' is passed as this argument.  The type of the constant must be
-\verb'double'.
+{\footnotesize
+\begin{verbatim}
+    GxB_set (A, GxB_HYPER_SWITCH, 0.1) ;
+    GxB_set (A, GxB_FORMAT, GxB_BY_ROW) ;
+\end{verbatim} }

-    {\footnotesize
-    \begin{verbatim}
-    int nthreads_max = 40 ;
-    GxB_set (GxB_NTHREADS, nthreads_max) ;
-    GxB_set (GxB_CHUNK, (double) 20000) ; \end{verbatim} }
+This changes the matrix \verb'A' so that it is stored by row, and it is
+converted from non-hypersparse to hypersparse format if it has fewer than 10\%
+non-empty columns.  If it is hypersparse, it is a candidate for conversion to
+non-hypersparse if it has 20\% or more non-empty columns.  If it has between
+10\% and 20\% non-empty columns, it remains in its current format.

+MATLAB only supports a non-hypersparse CSC format.  The format in
+SuiteSparse:GraphBLAS that is equivalent to the MATLAB format is:

-\item Per operation: Most GraphBLAS operations take a \verb'GrB_Descriptor'
-input, and this can be modified to set the number of threads and chunk
-size for the operation that uses this descriptor.  Note that \verb'chunk'
-is a \verb'double'.
+{\footnotesize
+\begin{verbatim}
+    GrB_init (...) ;
+    GxB_set (GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
+    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
+    // no subsequent use of GxB_HYPER_SWITCH or GxB_FORMAT
+\end{verbatim} }

-    {\footnotesize
-    \begin{verbatim}
-    GrB_Descriptor desc ;
-    GrB_Descriptor_new (&desc)
-    int nthreads_max = 40 ;
-    GxB_set (desc, GxB_NTHREADS, nthreads_max) ;
-    double chunk = 20000 ;
-    GxB_set (desc, GxB_CHUNK, chunk) ; \end{verbatim} }
+The \verb'GxB_HYPER_SWITCH' and \verb'GxB_FORMAT' options should be considered as
+suggestions from the user application as to how SuiteSparse:GraphBLAS can
+obtain the best performance for a particular application.
+SuiteSparse:GraphBLAS is free to ignore any of these suggestions, both now and
+in the future, and the available options and formats may be augmented in the
+future.  Any prior options no longer needed in future versions of
+SuiteSparse:GraphBLAS will be silently ignored, so the use of these options is
+safe for future updates.

-\end{itemize}
+The sparsity status of a matrix can be queried with the following, which
+returns a value of \verb'GxB_HYPERSPARSE', \verb'GxB_SPARSE', \verb'GxB_BITMAP',
+or \verb'GxB_FULL'.

-The smaller of \verb'nthreads_max' and \verb'floor(work/chunk)' is used for any
-given GraphBLAS operation, except that a single thread is used if this value is
-zero or less.
+{\footnotesize
+\begin{verbatim}
+    int sparsity ;
+    GxB_get (A, GxB_SPARSITY_STATUS, &sparsity) ; \end{verbatim}}

-If either parameter is set to \verb'GxB_DEFAULT', then default values are used.
-The default for \verb'nthreads_max' is the return value from
-\verb'omp_get_max_threads', and the default chunk size is currently 65,536.
+The sparsity format of a matrix can be controlled with \verb'GxB_set', which
+can be any mix (a sum or bitwise or) of \verb'GxB_HYPERSPARSE',
+\verb'GxB_SPARSE', \verb'GxB_BITMAP', and \verb'GxB_FULL'.  By default, a matrix
+or vector can be held in any format, with the default setting
+\verb'GxB_AUTO_SPARSITY', which is equal to \verb'GxB_HYPERSPARSE' +
+\verb'GxB_SPARSE' + \verb'GxB_BITMAP' + \verb'GxB_FULL'.  To enable a matrix to
+take on just \verb'GxB_SPARSE' or \verb'GxB_FULL' formats, but not
+\verb'GxB_HYPERSPARSE' or \verb'GxB_BITMAP', for example, use the following:

-If a descriptor value for either parameter is left at its default, or set to
-\verb'GxB_DEFAULT', then the global setting is used.  This global setting may
-have been modified from its default, and this modified value will be used.
+{\footnotesize
+\begin{verbatim}
+    GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_FULL) ; \end{verbatim}}

-For example, suppose \verb'omp_get_max_threads' reports 8 threads.  If \newline
-\verb'GxB_set (GxB_NTHREADS, 4)' is used, then the global setting is four
-threads, not eight.  If a descriptor is used but its \verb'GxB_NTHREADS' is not
-set, or set to \verb'GxB_DEFAULT', then any operation that uses this descriptor
-will use 4 threads.
+In this case, SuiteSparse:GraphBLAS will hold the matrix in sparse format
+(\verb'CSR' or \verb'CSC', depending on its \verb'GxB_FORMAT'), unless all
+entries are present, in which case it will be converted to full format.
+
+Only the least 4 bits of the sparsity control are considered, so the
+formats can be bitwise negated.  For example, to allow for any format
+except full:
+
+{\footnotesize
+\begin{verbatim}
+    GxB_set (A, GxB_SPARSITY_CONTROL, ~GxB_FULL) ; \end{verbatim}}

%-------------------------------------------------------------------------------
-\subsection{Storing a matrix by row or by column}
+\subsection{{\sf GxB\_BURBLE}, {\sf GxB\_PRINTF}, {\sf GxB\_FLUSH}: diagnostics}
%-------------------------------------------------------------------------------

-The GraphBLAS \verb'GrB_Matrix' is entirely opaque to the user application, and
-the GraphBLAS API does not specify how the matrix should be stored.  However,
-choices made in how the matrix is represented in a particular implementation,
-such as SuiteSparse:GraphBLAS, can have a large impact on performance.
+\verb'GxB_set (GxB_BURBLE, ...)' controls the burble setting.  It can also be
+controlled via \verb'GrB.burble(b)' in the Octave/MATLAB interface.

-Many graph algorithms are just as fast in any format, but some algorithms are
-much faster in one format or the other.  For example, suppose the user
-application stores a directed graph as a matrix \verb'A', with the edge $(i,j)$
-represented as the value \verb'A(i,j)', and the application makes many accesses
-to the $i$th row of the matrix, with \verb'GrB_Col_extract'
-\verb'(w,...,A,GrB_ALL,...,i,desc)' with the transposed descriptor
-(\verb'GrB_INP0' set to \verb'GrB_TRAN').  If the matrix is stored by column
-this can be extremely slow, just like the expression \verb'w=A(i,:)' in MATLAB,
-where \verb'i' is a scalar.  Since this is a typical use-case in graph
-algorithms, the default format in SuiteSparse:GraphBLAS is to store its
-matrices by row, in Compressed Sparse Row format (CSR).
+{\footnotesize
+\begin{verbatim}
+    GxB_set (GxB_BURBLE, true) ;    // enable burble
+    GxB_set (GxB_BURBLE, false) ;   // disable burble \end{verbatim}}

-MATLAB stores its sparse matrices by column, in ``non-hypersparse'' format, in
-what is called the Compressed Sparse Column format, or CSC for short.  An
-\verb'm'-by-\verb'n' matrix in MATLAB is represented as a set of \verb'n'
-column vectors, each with a sorted list of row indices and values of the
-nonzero entries in that column.  As a result, \verb'w=A(:,j)' is very fast in
-MATLAB, since the result is already held in the data structure a single list,
-the $j$th column vector.  However, \verb'w=A(i,:)' is very slow in MATLAB,
-since every column in the matrix has to be searched to see if it contains row
-\verb'i'.  In MATLAB, if many such accesses are made, it is much better to
-transpose the matrix (say \verb"AT=A'") and then use \verb"w=AT(:,i)" instead.
-This can have a dramatic impact on the performance of MATLAB.
+If enabled, SuiteSparse:GraphBLAS reports which internal kernels it uses, and
+how much time is spent.  If you see the word \verb'generic', it means that
+SuiteSparse:GraphBLAS was unable to use its faster kernels in
+\verb'Source/Generated2', but used a generic kernel that relies on function
+pointers.  This is done for user-defined types and operators, and when
+typecasting is performed, and it is typically slower than the kernels in
+\verb'Source/Generated2'.

-Likewise, if \verb'u' is a very sparse column vector and \verb'A' is stored by
-column, then \verb"w=u'*A" (via \verb'GrB_vxm') is slower than \verb'w=A*u'
-(via \verb'GrB_mxv').  The opposite is true if the matrix is stored by row.
+If you see a lot of \verb'wait' statements, it may mean that a lot of time is
+spent finishing a matrix or vector.  This may be the result of an inefficient
+use of the \verb'setElement' and \verb'assign' methods.  If this occurs you
+might try changing the sparsity format of a vector or matrix to
+\verb'GxB_BITMAP', assuming there's enough space for it.

-SuiteSparse:GraphBLAS stores its matrices by row, by default (with one
-exception described below).  However, it can also be instructed to store any
-selected matrices, or all matrices, by column instead (just like MATLAB), so
-that \verb'w=A(:,j)' (via \verb'GrB_Col_extract') is very fast.  The change in
-data format has no effect on the result, just the time and memory usage.  To
-use a column-oriented format by default, the following can be done in a user
-application that tends to access its matrices by column.
+\verb'GxB_set (GxB_PRINTF, printf)' allows the user application to change the
+function used to print diagnostic output.  This also controls the output of the
+\verb'GxB_*print' functions.  By default this parameter is \verb'NULL', in
+which case the ANSI C11 \verb'printf' function is used.  The parameter is a
+function pointer with the same signature as the ANSI C11 \verb'printf'
+function.  The Octave/MATLAB interface to GraphBLAS uses the following so that
+GraphBLAS can print to the Octave/MATLAB Command Window:

-    {\footnotesize
-    \begin{verbatim}
-    GrB_init (...) ;
-    // just after GrB_init: do the following:
-    #ifdef GxB_SUITESPARSE_GRAPHBLAS
-    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
-    #endif \end{verbatim} }
+{\footnotesize
+\begin{verbatim}
+    GxB_set (GxB_PRINTF, mexPrintf) ; \end{verbatim}}

-If this is done, and no other \verb'GxB_set' calls are made with
-\verb'GxB_FORMAT', all matrices will be stored by column.
-The default format is \verb'GxB_BY_ROW'.
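+Any function with the \verb'printf' signature can be supplied.  As a minimal
+sketch (the name \verb'my_printf' is hypothetical, not part of the GraphBLAS
+API), a user application could redirect all diagnostic output to \verb'stderr'
+as follows:
+
+{\footnotesize
+\begin{verbatim}
+    #include <stdarg.h>
+    #include <stdio.h>
+
+    // same signature as the ANSI C11 printf, but writes to stderr
+    int my_printf (const char *format, ...)
+    {
+        va_list ap ;
+        va_start (ap, format) ;
+        int result = vfprintf (stderr, format, ap) ;
+        va_end (ap) ;
+        return (result) ;
+    }
+
+    // after GrB_init, route all diagnostic output through my_printf:
+    GxB_set (GxB_PRINTF, my_printf) ;
+\end{verbatim}}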
+After each call to the \verb'printf' function, an optional
+\verb'flush' function is called, which is \verb'NULL' by default.  If
+\verb'NULL', the function is not used.  This can be changed with
+\verb'GxB_set (GxB_FLUSH, flush)'.  The \verb'flush' function takes no
+arguments, and returns an \verb'int' which is 0 if successful, or any nonzero
+value on failure (the same output as the ANSI C11 \verb'fflush' function,
+except that \verb'flush' has no inputs).

-All vectors (\verb'GrB_Vector') are held by column, and this cannot be changed.
+%-------------------------------------------------------------------------------
+\subsection{Other global options}
+%-------------------------------------------------------------------------------

-By default, matrices of size \verb'm-by-1' are held by column, regardless of
-the global setting described above.  Matrices of size \verb'1-by-n' with
-\verb'n' not equal to 1 are held by row, regardless of the global setting.
-The global setting only affects matrices with both \verb'm > 1' and \verb'n > 1'.
-Empty matrices (\verb'0-by-0') are also controlled by the global setting.
+\verb'GxB_MODE' can only be
+queried by \verb'GxB_get'; it cannot be modified by \verb'GxB_set'.  The mode
+is the value passed to \verb'GrB_init' (blocking or non-blocking).

-After creating a matrix with \verb'GrB_Matrix_new (&A, ...)',
-its format can be changed arbitrarily with \verb'GxB_set (A, GxB_FORMAT, ...)'.
-So even an \verb'm-by-1' matrix can then be changed to be held by row, for
-example.  Likewise, once a \verb'1-by-n' matrix is created, it can be converted
-to column-oriented format.
+All threads in the same user application share the same global options,
+including hypersparsity, bitmap options, and CSR/CSC format determined by
+\verb'GxB_set', and the blocking mode determined by \verb'GrB_init'.
+The format and hypersparsity parameters of each matrix are specific to
+that matrix and can be changed independently.

-%-------------------------------------------------------------------------------
-\subsection{Hypersparse matrices}
-\label{hypersparse}
-%-------------------------------------------------------------------------------
+The \verb'GxB_LIBRARY_*' options can be used with \verb'GxB_get' to query the
+current implementation.  For all of these, \verb'GxB_get' returns a string
+(\verb'char *'), except for \verb'GxB_LIBRARY_VERSION', which takes as input an
+\verb'int' array of size three.  The \verb'GxB_API_*' options can be used with
+\verb'GxB_get' to query the current GraphBLAS C API Specification.  For all of
+these, \verb'GxB_get' returns a string (\verb'char *'), except for
+\verb'GxB_API_VERSION', which takes as input an \verb'int' array of size three.

-MATLAB can store an \verb'm'-by-\verb'n' matrix with a very large value of
-\verb'm', since a CSC data structure takes $O(n+|{\bf A}|)$ memory, independent
-of \verb'm', where $|{\bf A}|$ is the number of nonzeros in the matrix.  It
-cannot store a matrix with a huge \verb'n', and this structure is also
-inefficient when $|{\bf A}|$ is much smaller than \verb'n'.  In contrast,
-SuiteSparse:GraphBLAS can store its matrices in {\em hypersparse} format,
-taking only $O(|{\bf A}|)$ memory, independent of how it is stored (by row or
-by column) and independent of both \verb'm' and \verb'n'
-\cite{BulucGilbert08,BulucGilbert12}.
+%===============================================================================
+\subsection{{\sf GxB\_Global\_Option\_set:} set a global option}
+%===============================================================================

-In both the CSR and CSC formats, the matrix is held as a set of sparse vectors.
-In non-hypersparse format, the set of sparse vectors is itself dense; all
-vectors are present, even if they are empty.  For example, an
-\verb'm'-by-\verb'n' matrix in non-hypersparse CSC format contains \verb'n'
-sparse vectors.  Each column vector takes at least one integer to represent,
-even for a column with no entries.  This allows for quick lookup for a
-particular vector, but the memory required is $O(n+|{\bf A}|)$.  With a
-hypersparse CSC format, the set of vectors itself is sparse, and columns with
-no entries take no memory at all.  The drawback of the hypersparse format is
-that finding an arbitrary column vector \verb'j', such as for the computation
-\verb'C=A(:,j)', takes $O(\log k)$ time if there $k \le n$ vectors in the data
-structure.  One advantage of the hypersparse structure is the memory required
-for an \verb'm'-by-\verb'n' hypersparse CSC matrix is only $O(|{\bf A}|)$,
-independent of \verb'm' and \verb'n'.  Algorithms that must visit all non-empty
-columns of a matrix are much faster when working with hypersparse matrices,
-since empty columns can be skipped.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_set            // set a global default option
+(
+    const GxB_Option_Field field,   // option to change
+    ...                             // value to change it to
+) ;
+\end{verbatim} } \end{mdframed}

-The \verb'hyper_switch' parameter controls the hypersparsity of the internal
-data structure for a matrix.  The parameter is typically in the range 0 to 1.
-The default is \verb'hyper_switch' = \verb'GxB_HYPER_DEFAULT', which is an
-\verb'extern' \verb'const' \verb'double' value, currently set to 0.0625, or
-1/16.  This default ratio may change in the future.
+This usage of \verb'GxB_set' sets the value of a global option.
+The \verb'field' parameter can be
+\verb'GxB_HYPER_SWITCH',
+\verb'GxB_BITMAP_SWITCH',
+\verb'GxB_FORMAT',
+\verb'GxB_NTHREADS',
+\verb'GxB_CHUNK',
+\verb'GxB_BURBLE',
+\verb'GxB_PRINTF',
+\verb'GxB_FLUSH',
+\verb'GxB_MEMORY_POOL',
+or
+\verb'GxB_PRINT_1BASED'.

-The \verb'hyper_switch' determines how the matrix is converted between the
-hypersparse and non-hypersparse formats.  Let $n$ be the number of columns of a
-CSC matrix, or the number of rows of a CSR matrix.  The matrix can have at most
-$n$ non-empty vectors.
+For example, the following usage sets the global hypersparsity ratio to 0.2,
+the format of future matrices to \verb'GxB_BY_COL', the maximum number
+of threads to 4, and the chunk size to 10000; it also enables the burble and
+directs diagnostic output to \verb'mexPrintf'.
+No existing matrices are changed.

-Let $k$ be the actual number of non-empty vectors.  That is, for the CSC
-format, $k \le n$ is the number of columns that have at least one entry.  Let
-$h$ be the value of \verb'hyper_switch'.
+{\footnotesize
+\begin{verbatim}
+    GxB_set (GxB_HYPER_SWITCH, 0.2) ;
+    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
+    GxB_set (GxB_NTHREADS, 4) ;
+    GxB_set (GxB_CHUNK, (double) 10000) ;
+    GxB_set (GxB_BURBLE, true) ;
+    GxB_set (GxB_PRINTF, mexPrintf) ;
+\end{verbatim} }

-If a matrix is currently hypersparse, it can be converted to non-hypersparse if
-the either condition $n \le 1$ or $k > 2nh$ holds, or both.  Otherwise, it
-stays hypersparse.
Note that if $n \le 1$ the matrix is always stored as
-non-hypersparse.
+The memory pool parameter sets an upper bound on the number of freed blocks of
+memory that SuiteSparse:GraphBLAS keeps in its internal memory pool for future
+allocations.  \verb'free_pool_limit' is an \verb'int64_t' array of size 64,
+and \verb'free_pool_limit [k]' is the upper bound on the number of blocks
+of size $2^k$ that are kept in the pool.  Passing in a \verb'NULL' pointer
+sets the defaults.  Passing in an array of size 64 whose entries are all zero
+disables the memory pool entirely.

-If currently non-hypersparse, it can be converted to hypersparse if
-both conditions $n > 1$ and $k \le nh$ hold.  Otherwise, it stays
-non-hypersparse.  Note that if $n \le 1$ the matrix always remains
-non-hypersparse.
+%===============================================================================
+\subsection{{\sf GxB\_Matrix\_Option\_set:} set a matrix option}
+%===============================================================================

-The default value of \verb'hyper_switch' is assigned at startup by
-\verb'GrB_init', and can then be modified globally with \verb'GxB_set'.  All
-new matrices are created with the same \verb'hyper_switch', determined by the
-global value.  Once a particular matrix \verb'A' has been constructed, its
-hypersparsity ratio can be modified from the default with:
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_set            // set an option in a matrix
+(
+    GrB_Matrix A,                   // matrix to modify
+    const GxB_Option_Field field,   // option to change
+    ...                             // value to change it to
+) ;
+\end{verbatim} } \end{mdframed}

-    {\footnotesize
-    \begin{verbatim}
-    double hyper_switch = 0.2 ;
-    GxB_set (A, GxB_HYPER_SWITCH, hyper_switch) ; \end{verbatim}}
+This usage of \verb'GxB_set' sets the value of a matrix option, for a
+particular matrix.
+The \verb'field' parameter can be
+\verb'GxB_HYPER_SWITCH',
+\verb'GxB_BITMAP_SWITCH',
+\verb'GxB_SPARSITY_CONTROL', or
+\verb'GxB_FORMAT'.
+
+For example, the following usage sets the hypersparsity ratio to 0.2, and the
+format to \verb'GxB_BY_COL', for a particular matrix \verb'A', and sets the
+sparsity control to \verb'GxB_SPARSE+GxB_FULL' (allowing the matrix to be held
+in CSC or FullC formats, but not BitmapC or HyperCSC).  SuiteSparse:GraphBLAS
+currently applies these changes immediately, but since they are simply hints,
+future versions of SuiteSparse:GraphBLAS may delay the change in format if it
+can obtain better performance.
+
+If the setting is just \verb'GxB_FULL' and some entries are missing, then
+the matrix is held in bitmap format.
+
+{\footnotesize
+\begin{verbatim}
+    GxB_set (A, GxB_HYPER_SWITCH, 0.2) ;
+    GxB_set (A, GxB_FORMAT, GxB_BY_COL) ;
+    GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_FULL) ;
+\end{verbatim} }
+
+For performance, the matrix option should be set as soon as the matrix is
+created with \verb'GrB_Matrix_new', so the internal transformation takes less
+time.

-To force a matrix to always be non-hypersparse, use \verb'hyper_switch' equal to
-\verb'GxB_NEVER_HYPER'.  To force a matrix to always stay hypersparse, set
-\verb'hyper_switch' to \verb'GxB_ALWAYS_HYPER'.
+If an error occurs, \verb'GrB_error(&err,A)' returns details about the error.

-A \verb'GrB_Matrix' can thus be held in one of four formats: any combination of
-hyper/non-hyper and CSR/CSC.  All \verb'GrB_Vector' objects are always stored
-in non-hypersparse CSC format.
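+For example, a matrix that should behave like a MATLAB sparse matrix can be
+configured immediately after creation (a minimal sketch; the dimensions are
+arbitrary and error checking is omitted):
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Matrix A ;
+    GrB_Matrix_new (&A, GrB_FP64, 1000, 1000) ;
+    // set the options right away, before any entries are added:
+    GxB_set (A, GxB_FORMAT, GxB_BY_COL) ;
+    GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_FULL) ;
+\end{verbatim}}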
+%=============================================================================== +\subsection{{\sf GxB\_Desc\_set:} set a {\sf GrB\_Descriptor} value} +%=============================================================================== +\label{gxbset} -A new matrix created via \verb'GrB_Matrix_new' starts with $k=0$ and is created -in hypersparse form by default unless $n \le 1$ or if $h<0$, where $h$ is the -global \verb'hyper_switch' value. The matrix is created in either -\verb'GxB_BY_ROW' or \verb'GxB_BY_COL' format, as determined by the last call -to \verb'GxB_set(GxB_FORMAT,...)' or \verb'GrB_init'. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_set // set a parameter in a descriptor +( + GrB_Descriptor desc, // descriptor to modify + const GrB_Desc_Field field, // parameter to change + ... // value to change it to +) ; +\end{verbatim} } \end{mdframed} -A new matrix \verb'C' created via \verb'GrB_dup (&C,A)' inherits the CSR/CSC -format, hypersparsity format, and \verb'hyper_switch' from \verb'A'. +This usage is similar to \verb'GrB_Descriptor_set', just with a name that is +consistent with the other usages of this generic function. Unlike +\verb'GrB_Descriptor_set', the \verb'field' may also be \verb'GxB_NTHREADS', +\verb'GxB_CHUNK', \verb'GxB_SORT', \verb'GxB_COMPRESSION', or +\verb'GxB_IMPORT'. Refer to Sections~\ref{descriptor_set}~and~\ref{desc_set} +for details. If an error occurs, \verb'GrB_error(&err,desc)' returns details +about the error. -%------------------------------------------------------------------------------- -\subsection{Bitmap matrices} -\label{bitmap_switch} -%------------------------------------------------------------------------------- +\newpage +%=============================================================================== +\subsection{{\sf GxB\_Global\_Option\_get:} retrieve a global option} +%=============================================================================== +\label{gxbget} -By default, SuiteSparse:GraphBLAS switches between all four formats -(hypersparse, sparse, bitmap, and full) automatically. Let $d = |{\bf A}|/mn$ -for an $m$-by-$n$ matrix $\bf A$ with $|{\bf A}|$ entries. If the matrix is -currently in sparse or hypersparse format, and is modified so that $d$ exceeds -a given threshold, it is converted into bitmap format. The default threshold -is controlled by the \verb'GxB_BITMAP_SWITCH' setting, which can be set -globally, or for a particular matrix or vector. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_get // gets the current global default option +( + const GxB_Option_Field field, // option to query + ... // return value of the global option +) ; +\end{verbatim} } \end{mdframed} -The default value of the switch to bitmap format depends on $\min(m,n)$, for a -matrix of size $m$-by-$n$. For the global setting, the bitmap switch is a -\verb'double' array of size \verb'GxB_NBITMAP_SWITCH'. The defaults are given -below: +This usage of \verb'GxB_get' retrieves the value of a global option. 
The
+\verb'field' parameter can be one of the following:

\vspace{0.2in}
-{\small
-\begin{tabular}{lll}
-parameter & default & matrix sizes \\
-\hline
-\verb'bitmap_switch [0]' & 0.04 & $\min(m,n) = 1$ (and all vectors) \\
-\verb'bitmap_switch [1]' & 0.05 & $\min(m,n) = 2$ \\
-\verb'bitmap_switch [2]' & 0.06 & $\min(m,n) = 3$ to 4 \\
-\verb'bitmap_switch [3]' & 0.08 & $\min(m,n) = 5$ to 8 \\
-\verb'bitmap_switch [4]' & 0.10 & $\min(m,n) = 9$ to 16\\
-\verb'bitmap_switch [5]' & 0.20 & $\min(m,n) = 17$ to 32\\
-\verb'bitmap_switch [6]' & 0.30 & $\min(m,n) = 33$ to 64 \\
-\verb'bitmap_switch [7]' & 0.40 & $\min(m,n) > 64$ \\
+{\footnotesize
+\begin{tabular}{ll}
+  \hline
+  \verb'GxB_HYPER_SWITCH'  & sparse/hyper setting \\
+  \verb'GxB_BITMAP_SWITCH' & bitmap/sparse setting \\
+  \verb'GxB_FORMAT'        & by row/col setting \\
+  \verb'GxB_MODE'          & blocking / non-blocking \\
+  \verb'GxB_NTHREADS'      & default number of threads \\
+  \verb'GxB_CHUNK'         & default chunk size \\
+  \verb'GxB_BURBLE'        & burble setting \\
+  \verb'GxB_PRINTF'        & printf function \\
+  \verb'GxB_FLUSH'         & flush function \\
+  \verb'GxB_MEMORY_POOL'   & memory pool control \\
+  \verb'GxB_PRINT_1BASED'  & for printing matrices/vectors \\
+  \hline
+  \verb'GxB_LIBRARY_NAME'  & the string
        \verb'"SuiteSparse:GraphBLAS"' \\
+  \verb'GxB_LIBRARY_VERSION' & \verb'int' array of size 3 \\
+  \verb'GxB_LIBRARY_DATE'  & date of release \\
+  \verb'GxB_LIBRARY_ABOUT' & author, copyright \\
+  \verb'GxB_LIBRARY_LICENSE' & license for the library \\
+  \verb'GxB_LIBRARY_COMPILE_DATE' & date of compilation \\
+  \verb'GxB_LIBRARY_COMPILE_TIME' & time of compilation \\
+  \verb'GxB_LIBRARY_URL'   & URL of the library \\
+  \hline
+  \verb'GxB_API_VERSION'   & GraphBLAS C API Specification Version \\
+  \verb'GxB_API_DATE'      & date of the C API Spec. \\
+  \verb'GxB_API_ABOUT'     & about the C API Spec. \\
+  \verb'GxB_API_URL'       & URL of the specification \\
+  \hline
\end{tabular}
}
\vspace{0.2in}

-That is, by default a \verb'GrB_Vector' is held in bitmap format if its density
-exceeds 4\%.  To change the global settings, do the following:
+For example:

{\footnotesize
\begin{verbatim}
-    double bswitch [GxB_NBITMAP_SWITCH] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 } ;
-    GxB_set (GxB_BITMAP_SWITCH, bswitch) ;
-\end{verbatim}
-}
+    double h ;
+    GxB_get (GxB_HYPER_SWITCH, &h) ;
+    printf ("hyper_switch = %g for all new matrices\n", h) ;

-If the matrix is currently in bitmap format, it is converted to full if all
-entries are present, or to sparse/hypersparse if $d$ drops below $b/2$, if its
-bitmap switch is $b$.  A matrix or vector with $d$ between $b/2$ and $b$
-remains in its current format.
+    double b [GxB_NBITMAP_SWITCH] ;
+    GxB_get (GxB_BITMAP_SWITCH, b) ;
+    for (int k = 0 ; k < GxB_NBITMAP_SWITCH ; k++)
+    {
+        printf ("bitmap_switch [%d] = %g ", k, b [k]) ;
+        if (k == 0)
+        {
+            printf ("for vectors and matrices with 1 row or column\n") ;
+        }
+        else if (k == GxB_NBITMAP_SWITCH - 1)
+        {
+            printf ("for matrices with min dimension > %d\n", 1 << (k-1)) ;
+        }
+        else
+        {
+            printf ("for matrices with min dimension %d to %d\n",
+                (1 << (k-1)) + 1, 1 << k) ;
+        }
+    }

-%-------------------------------------------------------------------------------
-\subsection{Parameter types}
-%-------------------------------------------------------------------------------
-The \verb'GxB_Option_Field' enumerated type gives the type of the \verb'field'
-parameter for the second argument of \verb'GxB_set' and \verb'GxB_get',
-for setting global options or matrix options.
+    GxB_Format_Value s ;
+    GxB_get (GxB_FORMAT, &s) ;
+    if (s == GxB_BY_COL) printf ("all new matrices are stored by column\n") ;
+    else printf ("all new matrices are stored by row\n") ;
+
+    GrB_Mode mode ;
+    GxB_get (GxB_MODE, &mode) ;
+    if (mode == GrB_BLOCKING) printf ("GrB_init(GrB_BLOCKING) was called.\n") ;
+    else printf ("GrB_init(GrB_NONBLOCKING) was called.\n") ;
+
+    int nthreads_max ;
+    GxB_get (GxB_NTHREADS, &nthreads_max) ;
+    printf ("max # of threads to use: %d\n", nthreads_max) ;
+
+    double chunk ;
+    GxB_get (GxB_CHUNK, &chunk) ;
+    printf ("chunk size: %g\n", chunk) ;
+
+    int64_t free_pool_limit [64] ;
+    GxB_get (GxB_MEMORY_POOL, free_pool_limit) ;
+    for (int k = 0 ; k < 64 ; k++)
+        printf ("pool %d: limit %ld\n", k, free_pool_limit [k]) ;
+
+    char *name ;
+    int ver [3] ;
+    GxB_get (GxB_LIBRARY_NAME, &name) ;
+    GxB_get (GxB_LIBRARY_VERSION, ver) ;
+    printf ("Library %s, version %d.%d.%d\n", name, ver [0], ver [1], ver [2]) ;
\end{verbatim} }

+%===============================================================================
+\subsection{{\sf GxB\_Matrix\_Option\_get:} retrieve a matrix option}
+%===============================================================================
+
+\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-typedef enum
-{
-    // for matrix/vector get/set and global get/set:
-    GxB_HYPER_SWITCH = 0,   // defines switch to hypersparse (double value)
-    GxB_BITMAP_SWITCH = 34, // defines switch to hypersparse (double value)
-    GxB_FORMAT = 1,    // defines CSR/CSC format: GxB_BY_ROW or GxB_BY_COL
-    GxB_SPARSITY_CONTROL = 32,  // control the sparsity of a matrix or vector
-
-    // for global get/set only:
-    GxB_GLOBAL_NTHREADS = GxB_NTHREADS,  // max number of threads to use
-    GxB_GLOBAL_CHUNK = GxB_CHUNK,  // chunk size for small problems
-    GxB_BURBLE = 99,   // diagnositic output
-    GxB_PRINTF = 101,  // printf function for diagnostic output
-    GxB_FLUSH = 102,   // flush function for diagnostic output
-    GxB_MEMORY_POOL = 103,  // memory pool control
-    GxB_PRINT_1BASED = 104, // print matrices as 0-based or 1-based
+GrB_Info GxB_get            // gets the current option of a matrix
+(
+    GrB_Matrix A,           // matrix to query
+    GxB_Option_Field field, // option to query
+    ...                     // return value of the matrix option
+) ;
+\end{verbatim} } \end{mdframed}

-    // for matrix/vector get only:
-    GxB_SPARSITY_STATUS = 33,   // query the sparsity of a matrix or vector
+This usage of \verb'GxB_get' retrieves the value of a matrix option.  The
+\verb'field' parameter can be
+\verb'GxB_HYPER_SWITCH',
+\verb'GxB_BITMAP_SWITCH',
+\verb'GxB_SPARSITY_CONTROL',
+\verb'GxB_SPARSITY_STATUS',
+or
+\verb'GxB_FORMAT'.
+For example:

-    // for global get only:
-    GxB_MODE = 2,   // mode passed to GrB_init (blocking or non-blocking)
-    GxB_LIBRARY_NAME = 8,   // name of the library (char *)
-    GxB_LIBRARY_VERSION = 9,    // library version (3 int's)
-    GxB_LIBRARY_DATE = 10,  // date of the library (char *)
-    GxB_LIBRARY_ABOUT = 11, // about the library (char *)
-    GxB_LIBRARY_URL = 12,   // URL for the library (char *)
-    GxB_LIBRARY_LICENSE = 13,   // license of the library (char *)
-    GxB_LIBRARY_COMPILE_DATE = 14,  // date library was compiled (char *)
-    GxB_LIBRARY_COMPILE_TIME = 15,  // time library was compiled (char *)
-    GxB_API_VERSION = 16,   // API version (3 int's)
-    GxB_API_DATE = 17,  // date of the API (char *)
-    GxB_API_ABOUT = 18, // about the API (char *)
-    GxB_API_URL = 19,   // URL for the API (char *)
-}
-GxB_Option_Field ;
-\end{verbatim} }

+\vspace{-0.1in}
+{\footnotesize
+\begin{verbatim}
+    double h, b ;
+    int sparsity, scontrol ;
+    GxB_get (A, GxB_SPARSITY_STATUS, &sparsity) ;
+    GxB_get (A, GxB_HYPER_SWITCH, &h) ;
+    printf ("matrix A has hyper_switch = %g\n", h) ;
+    GxB_get (A, GxB_BITMAP_SWITCH, &b) ;
+    printf ("matrix A has bitmap_switch = %g\n", b) ;
+    switch (sparsity)
+    {
+        case GxB_HYPERSPARSE: printf ("matrix A is hypersparse\n") ; break ;
+        case GxB_SPARSE:      printf ("matrix A is sparse\n"     ) ; break ;
+        case GxB_BITMAP:      printf ("matrix A is bitmap\n"     ) ; break ;
+        case GxB_FULL:        printf ("matrix A is full\n"       ) ; break ;
+    }
+    GxB_Format_Value s ;
+    GxB_get (A, GxB_FORMAT, &s) ;
+    printf ("matrix A is stored by %s\n", (s == GxB_BY_COL) ? "col" : "row") ;
+    GxB_get (A, GxB_SPARSITY_CONTROL, &scontrol) ;
+    if (scontrol & GxB_HYPERSPARSE) printf ("A may become hypersparse\n") ;
+    if (scontrol & GxB_SPARSE     ) printf ("A may become sparse\n") ;
+    if (scontrol & GxB_BITMAP     ) printf ("A may become bitmap\n") ;
+    if (scontrol & GxB_FULL       ) printf ("A may become full\n") ; \end{verbatim} }

-The \verb'GxB_FORMAT' field can be by row or by column, set to a value
-with the type \verb'GxB_Format_Value':
+\newpage
+%===============================================================================
+\subsection{{\sf GxB\_Desc\_get:} retrieve a {\sf GrB\_Descriptor} value}
+%===============================================================================

+\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-typedef enum
-{
-    GxB_BY_ROW = 0,     // CSR: compressed sparse row format
-    GxB_BY_COL = 1      // CSC: compressed sparse column format
-}
-GxB_Format_Value ;
-\end{verbatim} }
+GrB_Info GxB_get                // get a parameter from a descriptor
+(
+    GrB_Descriptor desc,        // descriptor to query; NULL means defaults
+    GrB_Desc_Field field,       // parameter to query
+    ...                         // value of the parameter
+) ;
+\end{verbatim} } \end{mdframed}

-The default format is given by the predefined value \verb'GxB_FORMAT_DEFAULT',
-which is equal to \verb'GxB_BY_ROW'.
-The default hypersparsity
-ratio is 0.0625 (1/16), but this value may change in the future.
+This usage of \verb'GxB_get' is the same as \verb'GxB_Desc_get'.  The
+\verb'field' parameter can
+be \verb'GrB_OUTP', \verb'GrB_MASK', \verb'GrB_INP0', \verb'GrB_INP1',
+\verb'GxB_AxB_METHOD',
+\verb'GxB_NTHREADS',
+\verb'GxB_CHUNK',
+\verb'GxB_SORT',
+\verb'GxB_COMPRESSION', or
+\verb'GxB_IMPORT'.
+Refer to Section~\ref{desc_get} for details.

-Setting the \verb'GxB_HYPER_SWITCH' field to \verb'GxB_ALWAYS_HYPER' ensures a matrix
-always stays hypersparse.  If set to \verb'GxB_NEVER_HYPER', it always stays
-non-hypersparse.
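+As a minimal sketch of these descriptor usages (error checking omitted; the
+fields and values shown here all appear in the summary that follows):
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Descriptor desc ;
+    GrB_Descriptor_new (&desc) ;
+    GxB_set (desc, GrB_INP0, GrB_TRAN) ;           // transpose the first input
+    GxB_set (desc, GxB_AxB_METHOD, GxB_AxB_DOT) ;  // request the dot-product method
+    GrB_Desc_Value v ;
+    GxB_get (desc, GrB_INP0, &v) ;                 // v is now GrB_TRAN
+    GrB_Descriptor_free (&desc) ;
+\end{verbatim}}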
At startup, \verb'GrB_init' defines the following initial -settings: +%=============================================================================== +\subsection{Summary of usage of {\sf GxB\_set} and {\sf GxB\_get}} +%=============================================================================== -{\footnotesize -\begin{verbatim} - GxB_set (GxB_HYPER_SWITCH, GxB_HYPER_DEFAULT) ; - GxB_set (GxB_FORMAT, GxB_BY_ROW) ; -\end{verbatim} } +The different usages of \verb'GxB_set' and \verb'GxB_get' are summarized below. -That is, by default, all new matrices are held by row in CSR format (except -for \verb'n-by-1' matrices; see \verb'GrB_Matrix_new'). -If a matrix has fewer than $n/16$ -columns, it can be converted to hypersparse format. If it has more than $n/8$ -columns, it can be converted to non-hypersparse format. These options can be -changed for all future matrices with \verb'GxB_set'. For example, to change -all future matrices to be in non-hypersparse CSC when created, use: +\noindent +To set/get the global options: -{\footnotesize -\begin{verbatim} + {\footnotesize + \begin{verbatim} + GxB_set (GxB_HYPER_SWITCH, double h) ; + GxB_set (GxB_HYPER_SWITCH, GxB_ALWAYS_HYPER) ; GxB_set (GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ; + GxB_get (GxB_HYPER_SWITCH, double *h) ; + double b [GxB_NBITMAP_SWITCH] ; + GxB_set (GxB_BITMAP_SWITCH, b) ; + GxB_set (GxB_BITMAP_SWITCH, NULL) ; // set defaults + GxB_get (GxB_BITMAP_SWITCH, b) ; + GxB_set (GxB_FORMAT, GxB_BY_ROW) ; GxB_set (GxB_FORMAT, GxB_BY_COL) ; -\end{verbatim} } + GxB_get (GxB_FORMAT, GxB_Format_Value *s) ; + GxB_set (GxB_NTHREADS, int nthreads_max) ; + GxB_get (GxB_NTHREADS, int *nthreads_max) ; + GxB_set (GxB_CHUNK, double chunk) ; + GxB_get (GxB_CHUNK, double *chunk) ; + GxB_set (GxB_BURBLE, bool burble) ; + GxB_get (GxB_BURBLE, bool *burble) ; + GxB_set (GxB_PRINTF, void *printf_function) ; + GxB_get (GxB_PRINTF, void **printf_function) ; + GxB_set (GxB_FLUSH, void *flush_function) ; + GxB_get (GxB_FLUSH, void **flush_function) ; + int64_t free_pool_limit [64] ; + GxB_set (GxB_MEMORY_POOL, free_pool_limit) ; + GxB_set (GxB_MEMORY_POOL, NULL) ; // set defaults + GxB_get (GxB_MEMORY_POOL, free_pool_limit) ; + GxB_set (GxB_PRINT_1BASED, bool onebased) ; + GxB_get (GxB_PRINT_1BASED, bool *onebased) ; \end{verbatim} } -Then if a particular matrix needs a different format, then (as an example): +\noindent +To get global options that can be queried but not modified: -{\footnotesize -\begin{verbatim} - GxB_set (A, GxB_HYPER_SWITCH, 0.1) ; - GxB_set (A, GxB_FORMAT, GxB_BY_ROW) ; -\end{verbatim} } + {\footnotesize + \begin{verbatim} + GxB_get (GxB_MODE, GrB_Mode *mode) ; + GxB_get (GxB_LIBRARY_NAME, char **) ; + GxB_get (GxB_LIBRARY_VERSION, int *) ; + GxB_get (GxB_LIBRARY_DATE, char **) ; + GxB_get (GxB_LIBRARY_ABOUT, char **) ; + GxB_get (GxB_LIBRARY_LICENSE, char **) ; + GxB_get (GxB_LIBRARY_COMPILE_DATE, char **) ; + GxB_get (GxB_LIBRARY_COMPILE_TIME, char **) ; + GxB_get (GxB_LIBRARY_URL, char **) ; + GxB_get (GxB_API_VERSION, int *) ; + GxB_get (GxB_API_DATE, char **) ; + GxB_get (GxB_API_ABOUT, char **) ; + GxB_get (GxB_API_URL, char **) ; \end{verbatim} } -This changes the matrix \verb'A' so that it is stored by row, and it is -converted from non-hypersparse to hypersparse format if it has fewer than 10\% -non-empty columns. If it is hypersparse, it is a candidate for conversion to -non-hypersparse if has 20\% or more non-empty columns. If it has between 10\% -and 20\% non-empty columns, it remains in its current format. 
-MATLAB only supports a non-hypersparse CSC format.  The format in
-SuiteSparse:GraphBLAS that is equivalent to the MATLAB format is:
+\noindent
+To set/get a matrix option or status:

-{\footnotesize
-\begin{verbatim}
-    GrB_init (...) ;
-    GxB_set (GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
-    GxB_set (GxB_FORMAT, GxB_BY_COL) ;
-    // no subsequent use of GxB_HYPER_SWITCH or GxB_FORMAT
-\end{verbatim} }
+    {\footnotesize
+    \begin{verbatim}
+    GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, double h) ;
+    GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, GxB_ALWAYS_HYPER) ;
+    GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
+    GxB_get (GrB_Matrix A, GxB_HYPER_SWITCH, double *h) ;
+    GxB_set (GrB_Matrix A, GxB_BITMAP_SWITCH, double b) ;
+    GxB_get (GrB_Matrix A, GxB_BITMAP_SWITCH, double *b) ;
+    GxB_set (GrB_Matrix A, GxB_FORMAT, GxB_BY_ROW) ;
+    GxB_set (GrB_Matrix A, GxB_FORMAT, GxB_BY_COL) ;
+    GxB_get (GrB_Matrix A, GxB_FORMAT, GxB_Format_Value *s) ;
+    GxB_set (GrB_Matrix A, GxB_SPARSITY_CONTROL, GxB_AUTO_SPARSITY) ;
+    GxB_set (GrB_Matrix A, GxB_SPARSITY_CONTROL, scontrol) ;
+    GxB_get (GrB_Matrix A, GxB_SPARSITY_CONTROL, int *scontrol) ;
+    GxB_get (GrB_Matrix A, GxB_SPARSITY_STATUS, int *sparsity) ; \end{verbatim} }

+\noindent
+To set/get a vector option or status:

+    {\footnotesize
+    \begin{verbatim}
+    GxB_set (GrB_Vector v, GxB_BITMAP_SWITCH, double b) ;
+    GxB_get (GrB_Vector v, GxB_BITMAP_SWITCH, double *b) ;
+    GxB_set (GrB_Vector v, GxB_FORMAT, GxB_BY_ROW) ;
+    GxB_set (GrB_Vector v, GxB_FORMAT, GxB_BY_COL) ;
+    GxB_get (GrB_Vector v, GxB_FORMAT, GxB_Format_Value *s) ;
+    GxB_set (GrB_Vector v, GxB_SPARSITY_CONTROL, GxB_AUTO_SPARSITY) ;
+    GxB_set (GrB_Vector v, GxB_SPARSITY_CONTROL, scontrol) ;
+    GxB_get (GrB_Vector v, GxB_SPARSITY_CONTROL, int *scontrol) ;
+    GxB_get (GrB_Vector v, GxB_SPARSITY_STATUS, int *sparsity) ; \end{verbatim} }

+\noindent
+To set/get a descriptor field:

+    {\footnotesize
+    \begin{verbatim}
+    GxB_set (GrB_Descriptor d, GrB_OUTP, GxB_DEFAULT) ;
+    GxB_set (GrB_Descriptor d, GrB_OUTP, GrB_REPLACE) ;
+    GxB_get (GrB_Descriptor d, GrB_OUTP, GrB_Desc_Value *v) ;
+    GxB_set (GrB_Descriptor d, GrB_MASK, GxB_DEFAULT) ;
+    GxB_set (GrB_Descriptor d, GrB_MASK, GrB_COMP) ;
+    GxB_set (GrB_Descriptor d, GrB_MASK, GrB_STRUCTURE) ;
+    GxB_set (GrB_Descriptor d, GrB_MASK, GrB_COMP+GrB_STRUCTURE) ;
+    GxB_get (GrB_Descriptor d, GrB_MASK, GrB_Desc_Value *v) ;
+    GxB_set (GrB_Descriptor d, GrB_INP0, GxB_DEFAULT) ;
+    GxB_set (GrB_Descriptor d, GrB_INP0, GrB_TRAN) ;
+    GxB_get (GrB_Descriptor d, GrB_INP0, GrB_Desc_Value *v) ;
+    GxB_set (GrB_Descriptor d, GrB_INP1, GxB_DEFAULT) ;
+    GxB_set (GrB_Descriptor d, GrB_INP1, GrB_TRAN) ;
+    GxB_get (GrB_Descriptor d, GrB_INP1, GrB_Desc_Value *v) ;
+    GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_DEFAULT) ;
+    GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_GUSTAVSON) ;
+    GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_HASH) ;
+    GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_SAXPY) ;
+    GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_DOT) ;
+    GxB_get (GrB_Descriptor d, GxB_AxB_METHOD, GrB_Desc_Value *v) ;
+    GxB_set (GrB_Descriptor d, GxB_NTHREADS, int nthreads) ;
+    GxB_get (GrB_Descriptor d, GxB_NTHREADS, int *nthreads) ;
+    GxB_set (GrB_Descriptor d, GxB_CHUNK, double chunk) ;
+    GxB_get (GrB_Descriptor d, GxB_CHUNK, double *chunk) ;
+    GxB_set (GrB_Descriptor d, GxB_SORT, int sort) ;
+    GxB_get (GrB_Descriptor d, GxB_SORT, int *sort) ;
+    GxB_set (GrB_Descriptor d, GxB_IMPORT, GxB_FAST_IMPORT) ;
+    GxB_set (GrB_Descriptor d, GxB_IMPORT, GxB_SECURE_IMPORT) ;
+    GxB_get (GrB_Descriptor d, GxB_IMPORT, GrB_Desc_Value *method) ;
+    GxB_set (GrB_Descriptor d, GxB_COMPRESSION, int method) ;
+    GxB_get (GrB_Descriptor d, GxB_COMPRESSION, int *method) ; \end{verbatim} }

+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{SuiteSparse:GraphBLAS Colon and Index Notation} %%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{colon}
+
+Octave/MATLAB uses a colon notation to index into matrices, such as
+\verb'C=A(2:4,3:8)', which extracts \verb'C' as a 3-by-6 submatrix of \verb'A',
+from rows 2 through 4 and columns 3 to 8.  A single
+colon is used to denote all rows, \verb'C=A(:,9)', or all columns,
+\verb'C=A(12,:)', which refers to the 9th column and 12th row of \verb'A',
+respectively.  An arbitrary integer list can be given as well, such as the
+Octave/MATLAB statements:
+
+    {\footnotesize
+    \begin{verbatim}
+    I = [2 1 4] ;
+    J = [3 5] ;
+    C = A (I,J) ; \end{verbatim} }
+\noindent
+which creates the 3-by-2 matrix \verb'C' as follows:
+\[
+C =
+\left[
+\begin{array}{cc}
+a_{2,3} & a_{2,5} \\
+a_{1,3} & a_{1,5} \\
+a_{4,3} & a_{4,5} \\
+\end{array}
+\right]
+\]
+
+The GraphBLAS API can do the equivalent of \verb'C=A(I,J)',
+\verb'C=A(:,J)', \verb'C=A(I,:)', and \verb'C=A(:,:)', by passing a parameter
+\verb'const GrB_Index *I' as either an array of size \verb'ni', or as the
+special value \verb'GrB_ALL', which corresponds to the stand-alone colon
+\verb'C=A(:,J)', and the same can be done for \verb'J'.  To compute
+\verb'C=A(2:4,3:8)' in GraphBLAS requires the user application to create two
+explicit integer arrays \verb'I' and \verb'J' of size 3 and 6, respectively,
+and then fill them with the explicit values \verb'[2,3,4]' and
+\verb'[3,4,5,6,7,8]'.  This works well if the lists are small, or if the matrix
+has more entries than rows or columns.
+
+However, particularly with hypersparse matrices, the size of the explicit
+arrays \verb'I' and \verb'J' can vastly exceed the number of entries in the
+matrix.  When using its hypersparse format, SuiteSparse:GraphBLAS allows the
+user application to create a \verb'GrB_Matrix' with dimensions up to $2^{60}$,
+with no memory constraints.  The only constraint on memory usage in a
+hypersparse matrix is the number of entries in the matrix.
+
+For example, creating an $n$-by-$n$ matrix \verb'A' of type \verb'GrB_FP64' with
+$n=2^{60}$ and one million entries is trivial to do in Version 2.1 (and later)
+of SuiteSparse:GraphBLAS, taking at most 24MB of space.  SuiteSparse:GraphBLAS
+Version 2.1 (or later) could do this on an old smartphone.  However, using just
+the pure GraphBLAS API, constructing \verb'C=A(0:(n/2),0:(n/2))'
+in SuiteSparse Version 2.0 would require the creation of an integer array
+\verb'I' of size $2^{59}$, containing the sequence 0, 1, 2, 3, ...., requiring
+about 4 ExaBytes of memory (4 million terabytes).  This is roughly 1000 times
+larger than the memory size of the world's largest computer in 2018.
+
+SuiteSparse:GraphBLAS Version 2.1 and later extends the GraphBLAS API with a
+full implementation of the MATLAB colon notation for integers,
+\verb'I=begin:inc:end'.  This extension allows the construction of the matrix
+\verb'C=A(0:(n/2),0:(n/2))' in this example, with dimension $2^{59}$, probably
+taking just milliseconds on an old smartphone.
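+For example, assuming \verb'A' is a \verb'GrB_FP64' matrix, the Octave/MATLAB
+expression \verb'C=A(2:4,3:8)' can be written with explicit index arrays as
+follows (a minimal sketch, with error checking omitted):
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Index I [3] = { 2, 3, 4 } ;             // rows 2:4
+    GrB_Index J [6] = { 3, 4, 5, 6, 7, 8 } ;    // columns 3:8
+    GrB_Matrix C ;
+    GrB_Matrix_new (&C, GrB_FP64, 3, 6) ;
+    GrB_Matrix_extract (C, NULL, NULL, A, I, 3, J, 6, NULL) ;
+\end{verbatim}}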
-The \verb'GxB_HYPER_SWITCH' and \verb'GxB_FORMAT' options should be considered as
-suggestions from the user application as to how SuiteSparse:GraphBLAS can
-obtain the best performance for a particular application.
-SuiteSparse:GraphBLAS is free to ignore any of these suggestions, both now and
-in the future, and the available options and formats may be augmented in the
-future.  Any prior options no longer needed in future versions of
-SuiteSparse:GraphBLAS will be silently ignored, so the use these options is
-safe for future updates.
+The \verb'GrB_extract', \verb'GrB_assign', and \verb'GxB_subassign' operations
+(described in Section~\ref{operations}) each have parameters that define a
+list of integer indices, using two parameters:

-The sparsity status of a matrix can be queried with the following, which
-returns a value of \verb'GxB_HYPERSPARSE' \verb'GxB_SPARSE' \verb'GxB_BITMAP'
-or \verb'GxB_FULL'.
+    \vspace{-0.05in}
+    {\footnotesize
+    \begin{verbatim}
+    const GrB_Index *I ;    // an array, or a special value GrB_ALL
+    GrB_Index ni ;          // the size of I, or a special value \end{verbatim}}

-{\footnotesize
-\begin{verbatim}
-    int sparsity ;
-    GxB_get (A, GxB_SPARSITY_STATUS, &sparsity) ; \end{verbatim}}
+\vspace{-0.05in}
+These two parameters define five kinds of index lists, which can be used to
+specify either an explicit or implicit list of row indices and/or column
+indices.  The length of the list of indices is denoted \verb'|I|'.  This
+discussion applies equally to the row indices \verb'I' and the column indices
+\verb'J'.  The five kinds are listed below.

-The sparsity format of a matrix can be controlled with \verb'GxB_set', which
-can be any mix (a sum or bitwise or) of \verb'GxB_HYPERSPARSE'
-\verb'GxB_SPARSE' \verb'GxB_BITMAP', and \verb'GxB_FULL'.  By default, a matrix
-or vector can be held in any format, with the default setting
-\verb'GxB_AUTO_SPARSITY', which is equal to \verb'GxB_HYPERSPARSE' +
-\verb'GxB_SPARSE' + \verb'GxB_BITMAP' + \verb'GxB_FULL'.  To enable a matrix to
-take on just \verb'GxB_SPARSE' or \verb'GxB_FULL' formats, but not
-\verb'GxB_HYPERSPARSE' or \verb'GxB_BITMAP', for example, use the following:
+\begin{enumerate}
+\item
+    An explicit list of indices, such as \verb'I = [2 1 4 7 2]' in MATLAB
+    notation, is handled by passing in \verb'I' as a pointer to an array of
+    size 5, and passing \verb'ni=5' as the size of the list.
+    The length of the explicit list is \verb'ni=|I|'.
+    Duplicates may appear, except that for some uses of \verb'GrB_assign'
+    and \verb'GxB_subassign', duplicates lead to undefined behavior
+    according to the GraphBLAS C API Specification.
+    SuiteSparse:GraphBLAS specifies how duplicates are handled in all cases,
+    as an addition to the specification.
+    See Section~\ref{duplicates} for details.

-{\footnotesize
-\begin{verbatim}
-    GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_FULL) ; \end{verbatim}}
+\item To specify all rows of a matrix, use \verb'I = GrB_ALL'.  The
+    parameter \verb'ni' is ignored.  This is equivalent to \verb'C=A(:,J)'
+    in MATLAB.  In GraphBLAS, this is the sequence \verb'0:(m-1)' if \verb'A'
+    has \verb'm' rows, with length \verb'|I|=m'.  If \verb'J' is used for the
+    columns of an \verb'm'-by-\verb'n' matrix, then \verb'J=GrB_ALL' refers to
+    all columns, and is the sequence \verb'0:(n-1)', of length \verb'|J|=n'.
-In this case, SuiteSparse:GraphBLAS will hold the matrix in sparse format
-(\verb'CSC' or \verb'CSC', depending on its \verb'GxB_FORMAT'), unless all
-entries are present, in which case it will be converted to full format.
+    \begin{alert}
+    {\bf SPEC:} If \verb'I' or \verb'J' are \verb'GrB_ALL', the specification
+    requires that \verb'ni' be passed in as \verb'm' (the number of rows)
+    and \verb'nj' be passed in as \verb'n'.  Any other value is an error.
+    SuiteSparse:GraphBLAS ignores these scalar inputs and treats them as if
+    they are equal to their only possible correct value.
+    \end{alert}

-Only the least 4 bits of the sparsity control are considered, so the
-formats can be bitwise negated.  For example, to allow for any format
-except full:
+\item To specify a contiguous range of indices, such as \verb'I=10:20'
+    in MATLAB, the array \verb'I' has size 2, and \verb'ni' is passed to
+    SuiteSparse:GraphBLAS as the special value \verb'ni = GxB_RANGE'.  The
+    beginning index is \verb'I[GxB_BEGIN]' and the ending index is
+    \verb'I[GxB_END]'.  Both values must be non-negative since
+    \verb'GrB_Index' is an unsigned integer (\verb'uint64_t').  The value of
+    \verb'I[GxB_INC]' is ignored.

-{\footnotesize
-\begin{verbatim}
-    GxB_set (A, GxB_SPARSITY_CONTROL, ~GxB_FULL) ; \end{verbatim}}
+    \vspace{-0.05in}
+    {\footnotesize
+    \begin{verbatim}
+    // to specify I = 10:20
+    GrB_Index I [2], ni = GxB_RANGE ;
+    I [GxB_BEGIN] = 10 ;              // the start of the sequence
+    I [GxB_END  ] = 20 ;              // the end of the sequence \end{verbatim}}

%-------------------------------------------------------------------------------
-\subsection{{\sf GxB\_BURBLE}, {\sf GxB\_PRINTF}, {\sf GxB\_FLUSH}: diagnostics}
%-------------------------------------------------------------------------------

+    \vspace{-0.05in}
+    Let $b$ = \verb'I[GxB_BEGIN]', and let $e$ = \verb'I[GxB_END]'.
+    The sequence has length zero if $b > e$; otherwise the length is
+    $|I| = (e-b) + 1$.  A complete extraction example that uses
+    \verb'GxB_RANGE' appears at the end of this section.

-\verb'GxB_set (GxB_BURBLE, ...)' controls the burble setting.  It can also be
-controlled via \verb'GrB.burble(b)' in the Octave/MATLAB interface.
+\item To specify a strided range of indices with a non-negative stride,
+    such as \verb'I=3:2:10', the array \verb'I' has size 3, and \verb'ni' has
+    the special value \verb'GxB_STRIDE'.  This is the sequence 3, 5, 7, 9, of
+    length 4.  Note that 10 does not appear in the list.  The end point need
+    not appear if the increment goes past it.

-{\footnotesize
-\begin{verbatim}
-    GxB_set (GxB_BURBLE, true) ;    // enable burble
-    GxB_set (GxB_BURBLE, false) ;   // disable burble \end{verbatim}}
+    \vspace{-0.05in}
+    {\footnotesize
+    \begin{verbatim}
+    // to specify I = 3:2:10
+    GrB_Index I [3], ni = GxB_STRIDE ;
+    I [GxB_BEGIN ] = 3 ;              // the start of the sequence
+    I [GxB_INC   ] = 2 ;              // the increment
+    I [GxB_END   ] = 10 ;             // the end of the sequence \end{verbatim}}
+ \vspace{-0.05in}
+ The \verb'GxB_STRIDE' sequence is the same as the \verb'List' generated by
+ the following for loop:
-If you see a lot of \verb'wait' statements, it may mean that a lot of time is
-spent finishing a matrix or vector. This may be the result of an inefficient
-use of the \verb'setElement' and \verb'assign' methods. If this occurs you
-might try changing the sparsity format of a vector or matrix to
-\verb'GxB_BITMAP', assuming there's enough space for it.
+ \vspace{-0.05in}
+ {\footnotesize
+ \begin{verbatim}
+ int64_t k = 0 ;
+ GrB_Index *List = (a pointer to an array of large enough size)
+ for (int64_t i = I [GxB_BEGIN] ; i <= I [GxB_END] ; i += I [GxB_INC])
+ {
+ // i is the kth entry in the sequence
+ List [k++] = i ;
+ } \end{verbatim}}
-\verb'GxB_set (GxB_PRINTF, printf)' allows the user application to change the
-function used to print diagnostic output. This also controls the output of the
-\verb'GxB_*print' functions. By default this parameter is \verb'NULL', in
-which case the ANSI C11 \verb'printf' function is used. The parameter is a
-function pointer with the same signature as the ANSI C11 \verb'printf'
-function. The Octave/MATLAB interface to GraphBLAS uses the following so that
-GraphBLAS can print to the Octave/MATLAB Command Window:
+ \vspace{-0.05in}
+ Then passing the explicit array \verb'List' and its length \verb'ni=k' has
+ the same effect as passing in the array \verb'I' of size 3, with
+ \verb'ni=GxB_STRIDE'. The latter is simply much faster to produce, and
+ much more efficient for SuiteSparse:GraphBLAS to process.
-{\footnotesize
-\begin{verbatim}
- GxB_set (GxB_PRINTF, mexPrintf) \end{verbatim}}
+ Let $b$ = \verb'I[GxB_BEGIN]', let $e$ = \verb'I[GxB_END]', and let
+ $\Delta$ = \verb'I[GxB_INC]'. The sequence has length zero if $b > e$ or
+ $\Delta=0$. Otherwise, the length of the sequence is
+ \[
+ |I| = \Bigl\lfloor\dfrac{e-b}{\Delta}\Bigr\rfloor + 1
+ \]
-After each call to the \verb'printf' function, an optional
-\verb'flush' function is called, which is \verb'NULL' by default. If
-\verb'NULL', the function is not used. This can be changed with
-\verb'GxB_set (GxB_FLUSH, flush)'. The \verb'flush' function takes no
-arguments, and returns an \verb'int' which is 0 if successful, or any nonzero
-value on failure (the same output as the ANSI C11 \verb'fflush' function,
-except that \verb'flush' has no inputs).
+\item
+ In MATLAB notation, if the stride is negative, the sequence is decreasing.
+ For example, \verb'10:-2:1' is the sequence 10, 8, 6, 4, 2, in that order.
+ In SuiteSparse:GraphBLAS, use \verb'ni = GxB_BACKWARDS', with an array
+ \verb'I' of size 3. The following example defines the equivalent
+ of the MATLAB expression \verb'10:-2:1' in SuiteSparse:GraphBLAS:
-%-------------------------------------------------------------------------------
-\subsection{Other global options}
-%-------------------------------------------------------------------------------
+ \vspace{-0.1in}
+ {\footnotesize
+ \begin{verbatim}
+ // to specify I = 10:-2:1
+ GrB_Index I [3], ni = GxB_BACKWARDS ;
+ I [GxB_BEGIN ] = 10 ; // the start of the sequence
+ I [GxB_INC ] = 2 ; // the magnitude of the increment
+ I [GxB_END ] = 1 ; // the end of the sequence \end{verbatim}}
-\verb'GxB_MODE' can only be
-queried by \verb'GxB_get'; it cannot be modified by \verb'GxB_set'. The mode
-is the value passed to \verb'GrB_init' (blocking or non-blocking).
+ \vspace{-0.1in} + The value -2 cannot be assigned to the \verb'GrB_Index' array \verb'I', + since that is an unsigned type. The signed increment is represented + instead with the special value \verb'ni = GxB_BACKWARDS'. + The \verb'GxB_BACKWARDS' sequence is the same as generated by the following + for loop: -All threads in the same user application share the same global options, -including hypersparsity, bitmap options, and CSR/CSC format determined by -\verb'GxB_set', and the blocking mode determined by \verb'GrB_init'. -Specific format and hypersparsity parameters of each matrix are specific to -that matrix and can be independently changed. + \vspace{-0.1in} + {\footnotesize + \begin{verbatim} + int64_t k = 0 ; + GrB_Index *List = (a pointer to an array of large enough size) + for (int64_t i = I [GxB_BEGIN] ; i >= I [GxB_END] ; i -= I [GxB_INC]) + { + // i is the kth entry in the sequence + List [k++] = i ; + } \end{verbatim}} -The \verb'GxB_LIBRARY_*' options can be used with \verb'GxB_get' to query the -current implementation. For all of these, \verb'GxB_get' returns a string -(\verb'char *'), except for \verb'GxB_LIBRARY_VERSION', which takes as input an -\verb'int' array of size three. The \verb'GxB_API_*' options can be used with -\verb'GxB_get' to query the current GraphBLAS C API Specification. For all of -these, \verb'GxB_get' returns a string (\verb'char *'), except for -\verb'GxB_API_VERSION', which takes as input an \verb'int' array of size three. + \vspace{-0.1in} + Let $b$ = \verb'I[GxB_BEGIN]', let $e$ = \verb'I[GxB_END]', and let + $\Delta$ = \verb'I[GxB_INC]' (note that $\Delta$ is not negative). The + sequence has length zero if $b < e$ or $\Delta=0$. Otherwise, the length + of the sequence is + \[ + |I| = \Bigl\lfloor\dfrac{b-e}{\Delta}\Bigr\rfloor + 1 + \] -%=============================================================================== -\subsection{{\sf GxB\_Global\_Option\_set:} set a global option} -%=============================================================================== +\end{enumerate} -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GxB_set // set a global default option -( - const GxB_Option_Field field, // option to change - ... // value to change it to -) ; -\end{verbatim} } \end{mdframed} +Since \verb'GrB_Index' is an unsigned integer, all three values +\verb'I[GxB_BEGIN]', \verb'I[GxB_INC]', and \verb'I[GxB_END]' must +be non-negative. -This usage of \verb'GxB_set' sets the value of a global option. -The \verb'field' parameter can be -\verb'GxB_HYPER_SWITCH', -\verb'GxB_BITMAP_SWITCH', -\verb'GxB_FORMAT', -\verb'GxB_NTHREADS', -\verb'GxB_CHUNK', -\verb'GxB_BURBLE', -\verb'GxB_PRINTF', -\verb'GxB_FLUSH', -\verb'GxB_MEMORY_POOL', -or -\verb'GxB_PRINT_1BASED'. +Just as in MATLAB, it is valid to specify an empty sequence of length zero. +For example, \verb'I = 5:3' has length zero in MATLAB and the same is +true for a \verb'GxB_RANGE' sequence in SuiteSparse:GraphBLAS, with +\verb'I[GxB_BEGIN]=5' and \verb'I[GxB_END]=3'. This has the same +effect as array \verb'I' with \verb'ni=0'. -For example, the following usage sets the global hypersparsity ratio to 0.2, -the format of future matrices to \verb'GxB_BY_COL', the maximum number -of threads to 4, the chunk size to 10000, and enables the burble. -No existing matrices are changed. 
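+
+For illustration, the two implicit kinds can be combined in a single call to
+\verb'GrB_extract' (Section~\ref{extract}). The following is a minimal sketch,
+not taken from the specification, that computes the equivalent of the MATLAB
+expression \verb'C=A(10:20,3:2:10)'; it assumes \verb'C' has already been
+created with \verb'GrB_Matrix_new' as an 11-by-4 matrix of the same type as
+\verb'A':
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Index I [2], ni = GxB_RANGE ;   // I = 10:20, of length |I| = 11
+    I [GxB_BEGIN] = 10 ;
+    I [GxB_END  ] = 20 ;
+    GrB_Index J [3], nj = GxB_STRIDE ;  // J = 3:2:10 = [3 5 7 9], |J| = 4
+    J [GxB_BEGIN] = 3 ;
+    J [GxB_INC  ] = 2 ;
+    J [GxB_END  ] = 10 ;
+    // C = A (I,J), with no mask and no accumulator
+    GrB_Matrix_extract (C, NULL, NULL, A, I, ni, J, nj, NULL) ; \end{verbatim}}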
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{GraphBLAS Operations} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{operations}
-{\footnotesize
-\begin{verbatim}
- GxB_set (GxB_HYPER_SWITCH, 0.2) ;
- GxB_set (GxB_FORMAT, GxB_BY_COL) ;
- GxB_set (GxB_NTHREADS, 4) ;
- GxB_set (GxB_CHUNK, (double) 10000) ;
- GxB_set (GxB_BURBLE, true) ;
- GxB_set (GxB_PRINTF, mexPrintf) ;
-\end{verbatim} }
+The next sections define each of the GraphBLAS operations, also listed in the
+table below.
-The memory pool parameter sets an upper bound on the number of freed blocks of
-memory that SuiteSparse:GraphBLAS keeps in its internal memory pool for future
-allocations. \verb'free_pool_limit' is an \verb'int64_t' array of size 64,
-and \verb'free_pool_limit [k]' is the upper bound on the number of blocks
-of size $2^k$ that are kept in the pool. Passing in a \verb'NULL' pointer
-sets the defaults. Passing in an array of size 64 whose entries are all zero
-disables the memory pool entirely.
+\vspace{0.2in}
+{\small
+\begin{tabular}{lll}
+\hline
+\verb'GrB_mxm' & matrix-matrix multiply & ${\bf C \langle M \rangle = C \odot AB}$ \\
+\verb'GrB_vxm' & vector-matrix multiply & ${\bf w^{\sf T}\langle m^{\sf T}\rangle = w^{\sf T}\odot u^{\sf T}A}$ \\
+\verb'GrB_mxv' & matrix-vector multiply & ${\bf w \langle m \rangle = w \odot Au}$ \\
+\hline
+\verb'GrB_eWiseMult' & element-wise, & ${\bf C \langle M \rangle = C \odot (A \otimes B)}$ \\
+ & set intersection & ${\bf w \langle m \rangle = w \odot (u \otimes v)}$ \\
+\hline
+\verb'GrB_eWiseAdd' & element-wise, & ${\bf C \langle M \rangle = C \odot (A \oplus B)}$ \\
+ & set union & ${\bf w \langle m \rangle = w \odot (u \oplus v)}$ \\
+\hline
+\verb'GxB_eWiseUnion'& element-wise, & ${\bf C \langle M \rangle = C \odot (A \oplus B)}$ \\
+ & set union & ${\bf w \langle m \rangle = w \odot (u \oplus v)}$ \\
+\hline
+\verb'GrB_extract' & extract submatrix & ${\bf C \langle M \rangle = C \odot A(I,J)}$ \\
+ & & ${\bf w \langle m \rangle = w \odot u(i)}$ \\
+\hline
+\verb'GxB_subassign' & assign submatrix, & ${\bf C (I,J) \langle M \rangle = C(I,J) \odot A}$ \\
+ & with submask for ${\bf C(I,J)}$
+ & ${\bf w (i) \langle m \rangle = w(i) \odot u}$ \\
+\hline
+\verb'GrB_assign' & assign submatrix & ${\bf C \langle M \rangle (I,J) = C(I,J) \odot A}$ \\
+ & with submask for ${\bf C}$
+ & ${\bf w \langle m \rangle (i) = w(i) \odot u}$ \\
+\hline
+\verb'GrB_apply' & apply unary operator & ${\bf C \langle M \rangle = C \odot} f{\bf (A)}$ \\
+ & & ${\bf w \langle m \rangle = w \odot} f{\bf (u)}$ \\
+ & apply binary operator & ${\bf C \langle M \rangle = C \odot} f(x,{\bf A})$ \\
+ & & ${\bf C \langle M \rangle = C \odot} f({\bf A},y)$ \\
+ & & ${\bf w \langle m \rangle = w \odot} f(x,{\bf u})$ \\
+ & & ${\bf w \langle m \rangle = w \odot} f({\bf u},y)$ \\
+ & apply index-unary op & ${\bf C \langle M \rangle = C \odot} f({\bf A},i,j,k)$ \\
+ & & ${\bf w \langle m \rangle = w \odot} f({\bf u},i,0,k)$ \\
+\hline
+\verb'GrB_select' & select entries & ${\bf C \langle M \rangle = C \odot} \mbox{select}({\bf A},i,j,k)$ \\
+ & & ${\bf w \langle m \rangle = w \odot} \mbox{select}({\bf u},i,0,k)$ \\
+\hline
+\verb'GrB_reduce' & reduce to vector & ${\bf w \langle m \rangle = w \odot} [{\oplus}_j {\bf A}(:,j)]$ \\
+ & reduce to scalar & $s = s \odot [{\oplus}_{ij} {\bf A}(I,J)]$ \\
+\hline
+\verb'GrB_transpose' & transpose & ${\bf C \langle M \rangle = C
\odot A^{\sf T}}$ \\ +\hline +\verb'GrB_kronecker' & Kronecker product & ${\bf C \langle M \rangle = C \odot \mbox{kron}(A, B)}$ \\ +\hline +\end{tabular} +} +\vspace{0.2in} + +If an error occurs, \verb'GrB_error(&err,C)' or \verb'GrB_error(&err,w)' +returns details about the error, for operations that return a modified matrix +\verb'C' or vector \verb'w'. The only operation that cannot return an error +string is reduction to a scalar with \verb'GrB_reduce'. +\newpage %=============================================================================== -\subsection{{\sf GxB\_Matrix\_Option\_set:} set a matrix option} +\subsection{{\sf GrB\_mxm:} matrix-matrix multiply} %=========================== %=============================================================================== +\label{mxm} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GxB_set // set an option in a matrix +GrB_Info GrB_mxm // C = accum (C, A*B) ( - GrB_Matrix A, // matrix to modify - const GxB_Option_Field field, // option to change - ... // value to change it to + GrB_Matrix C, // input/output matrix for results + const GrB_Matrix Mask, // optional mask for C, unused if NULL + const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) + const GrB_Semiring semiring, // defines '+' and '*' for A*B + const GrB_Matrix A, // first input: matrix A + const GrB_Matrix B, // second input: matrix B + const GrB_Descriptor desc // descriptor for C, Mask, A, and B ) ; \end{verbatim} } \end{mdframed} -This usage of \verb'GxB_set' sets the value of a matrix option, for a -particular matrix. -The \verb'field' parameter can be -\verb'GxB_HYPER_SWITCH', -\verb'GxB_BITMAP_SWITCH', -\verb'GxB_SPARSITY_CONTROL', or -\verb'GxB_FORMAT'. - -For example, the following usage sets the hypersparsity ratio to 0.2, and the -format of \verb'GxB_BY_COL', for a particular matrix \verb'A', and sets the -sparsity control to \verb'GxB_SPARSE+GxB_FULL' (allowing the matrix to be held -in CSC or FullC formats, but not BitmapC or HyperCSC). SuiteSparse:GraphBLAS -currently applies these changes immediately, but since they are simply hints, -future versions of SuiteSparse:GraphBLAS may delay the change in format if it -can obtain better performance. - -If the setting is just \verb'GxB_FULL' and some entries are missing, then -the matrix is held in bitmap format. +\verb'GrB_mxm' multiplies two sparse matrices \verb'A' and \verb'B' using the +\verb'semiring'. The input matrices \verb'A' and \verb'B' may be transposed +according to the descriptor, \verb'desc' (which may be \verb'NULL') and then +typecasted to match the multiply operator of the \verb'semiring'. Next, +\verb'T=A*B' is computed on the \verb'semiring', precisely defined in the +\verb'GB_spec_mxm.m' script in \verb'GraphBLAS/Test'. The actual algorithm +exploits sparsity and does not take $O(n^3)$ time, but it computes the +following: {\footnotesize \begin{verbatim} - GxB_set (A, GxB_HYPER_SWITCH, 0.2) ; - GxB_set (A, GxB_FORMAT, GxB_BY_COL) ; - GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_FULL) ; -\end{verbatim} } - -For performance, the matrix option should be set as soon as it is created with -\verb'GrB_Matrix_new', so the internal transformation takes less time. 
+[m s] = size (A.matrix) ;
+[s n] = size (B.matrix) ;
+T.matrix = zeros (m, n, multiply.ztype) ;
+T.pattern = zeros (m, n, 'logical') ;
+T.matrix (:,:) = identity ; % the identity of the semiring's monoid
+T.class = multiply.ztype ; % the ztype of the semiring's multiply op
+A1 = cast (A.matrix, multiply.xtype) ; % the xtype of the semiring's multiply op
+B1 = cast (B.matrix, multiply.ytype) ; % the ytype of the semiring's multiply op
+for j = 1:n
+ for i = 1:m
+ for k = 1:s
+ % T (i,j) += A (i,k) * B (k,j), using the semiring
+ if (A.pattern (i,k) && B.pattern (k,j))
+ z = multiply (A1 (i,k), B1 (k,j)) ;
+ T.matrix (i,j) = add (T.matrix (i,j), z) ;
+ T.pattern (i,j) = true ;
+ end
+ end
+ end
+end \end{verbatim}}
-If an error occurs, \verb'GrB_error(&err,A)' returns details about the error.
+Finally, \verb'T' is typecasted into the type of \verb'C', and the results are
+written back into \verb'C' via the \verb'accum' and \verb'Mask', ${\bf C
+\langle M \rangle = C \odot T}$. The latter step is reflected in the MATLAB
+function \verb'GB_spec_accum_mask.m', discussed in Section~\ref{accummask}.
-%===============================================================================
-\subsection{{\sf GxB\_Desc\_set:} set a {\sf GrB\_Descriptor} value}
-%===============================================================================
-\label{gxbset}
+\paragraph{\bf Performance considerations:}
+Suppose all matrices are in \verb'GxB_BY_COL' format, and \verb'B' is extremely
+sparse but \verb'A' is not as sparse. Then computing \verb'C=A*B' is very
+fast, and much faster than when \verb'A' is extremely sparse. For example, if
+\verb'A' is square and \verb'B' is a column vector that is all zero except
+for one entry \verb'B(j,0)=1', then \verb'C=A*B' is the same as extracting
+column \verb'A(:,j)'. This is very fast if \verb'A' is stored by column but
+slow if \verb'A' is stored by row. If \verb'A' is a sparse row with a single
+entry \verb'A(0,i)=1', then \verb'C=A*B' is the same as extracting row
+\verb'B(i,:)'. This is fast if \verb'B' is stored by row but slow if \verb'B'
+is stored by column.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_set // set a parameter in a descriptor
-(
- GrB_Descriptor desc, // descriptor to modify
- const GrB_Desc_Field field, // parameter to change
- ... // value to change it to
-) ;
-\end{verbatim} } \end{mdframed}
+If the user application needs to repeatedly extract rows and columns from a
+matrix, whether by matrix multiplication or by \verb'GrB_extract', then keep
+two copies: one stored by row and the other by column, and use the copy that
+results in the fastest computation.
-This usage is similar to \verb'GrB_Descriptor_set', just with a name that is
-consistent with the other usages of this generic function. Unlike
-\verb'GrB_Descriptor_set', the \verb'field' may also be \verb'GxB_NTHREADS',
-\verb'GxB_CHUNK', \verb'GxB_SORT', \verb'GxB_COMPRESSION', or
-\verb'GxB_IMPORT'. Refer to Sections~\ref{descriptor_set}~and~\ref{desc_set}
-for details. If an error occurs, \verb'GrB_error(&err,desc)' returns details
-about the error.
+By default, \verb'GrB_mxm', \verb'GrB_mxv', \verb'GrB_vxm', and
+\verb'GrB_reduce' (to vector) can return their result in a jumbled state, with
+the sort left pending. It can sometimes be faster for these methods to do the
+sort as they compute their result. Use the \verb'GxB_SORT' descriptor setting
+to select this option. Refer to Section~\ref{descriptor} for details.
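+
+As a minimal sketch (not part of the specification), the following computes
+\verb'C=A*B' on the conventional plus-times semiring, with no mask and no
+accumulator, assuming \verb'A' and \verb'B' are \verb'GrB_FP64' matrices of
+size \verb'm'-by-\verb'k' and \verb'k'-by-\verb'n', respectively:
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Matrix C ;
+    GrB_Matrix_new (&C, GrB_FP64, m, n) ;
+    // C = A*B, using the built-in plus-times semiring on doubles
+    GrB_mxm (C, NULL, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, NULL) ; \end{verbatim}}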
\newpage %=============================================================================== -\subsection{{\sf GxB\_Global\_Option\_get:} retrieve a global option} +\subsection{{\sf GrB\_vxm:} vector-matrix multiply} %=========================== %=============================================================================== -\label{gxbget} +\label{vxm} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GxB_get // gets the current global default option +GrB_Info GrB_vxm // w' = accum (w, u'*A) ( - const GxB_Option_Field field, // option to query - ... // return value of the global option + GrB_Vector w, // input/output vector for results + const GrB_Vector mask, // optional mask for w, unused if NULL + const GrB_BinaryOp accum, // optional accum for z=accum(w,t) + const GrB_Semiring semiring, // defines '+' and '*' for u'*A + const GrB_Vector u, // first input: vector u + const GrB_Matrix A, // second input: matrix A + const GrB_Descriptor desc // descriptor for w, mask, and A ) ; \end{verbatim} } \end{mdframed} -This usage of \verb'GxB_get' retrieves the value of a global option. The -\verb'field' parameter can be one of the following: - -\vspace{0.2in} -{\footnotesize -\begin{tabular}{ll} - \hline - \verb'GxB_HYPER_SWITCH' & sparse/hyper setting \\ - \verb'GxB_BITMAP_SWITCH' & bitmap/sparse setting \\ - \verb'GxB_FORMAT' & by row/col setting \\ - \verb'GxB_MODE' & blocking / non-blocking \\ - \verb'GxB_NTHREADS' & default number of threads \\ - \verb'GxB_CHUNK' & default chunk size \\ - \verb'GxB_BURBLE' & burble setting \\ - \verb'GxB_PRINTF' & printf function \\ - \verb'GxB_FLUSH' & flush function \\ - \verb'GxB_MEMORY_POOL' & memory pool control \\ - \verb'GxB_PRINT_1BASED' & for printing matrices/vectors \\ - \hline - \verb'GxB_LIBRARY_NAME' & the string - \verb'"SuiteSparse:GraphBLAS"' \\ - \verb'GxB_LIBRARY_VERSION' & \verb'int' array of size 3 \\ - \verb'GxB_LIBRARY_DATE' & date of release \\ - \verb'GxB_LIBRARY_ABOUT' & author, copyright \\ - \verb'GxB_LIBRARY_LICENSE' & license for the library \\ - \verb'GxB_LIBRARY_COMPILE_DATE' & date of compilation \\ - \verb'GxB_LIBRARY_COMPILE_TIME' & time of compilation \\ - \verb'GxB_LIBRARY_URL' & URL of the library \\ - \hline - \verb'GxB_API_VERSION' & GraphBLAS C API Specification Version \\ - \verb'GxB_API_DATE' & date of the C API Spec. \\ - \verb'GxB_API_ABOUT' & about of the C API Spec. \\ - \verb'GxB_API_URL' & URL of the specification \\ - \hline -\end{tabular} -} -\vspace{0.2in} +\verb'GrB_vxm' multiplies a row vector \verb"u'" times a matrix \verb'A'. The +matrix \verb'A' may be first transposed according to \verb'desc' (as the second +input, \verb'GrB_INP1'); the column vector \verb'u' is never transposed via the +descriptor. The inputs \verb'u' and \verb'A' are typecasted to match the +\verb'xtype' and \verb'ytype' inputs, respectively, of the multiply operator of +the \verb'semiring'. Next, an intermediate column vector \verb"t=A'*u" is +computed on the \verb'semiring' using the same method as \verb'GrB_mxm'. +Finally, the column vector \verb't' is typecasted from the \verb'ztype' of the +multiply operator of the \verb'semiring' into the type of \verb'w', and the +results are written back into \verb'w' using the optional accumulator +\verb'accum' and \verb'mask'. -For example: +The last step is ${\bf w \langle m \rangle = w \odot t}$, as described +in Section~\ref{accummask}, except that all the +terms are column vectors instead of matrices. 
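+
+For example, the sketch below (assuming \verb'u' is a \verb'GrB_BOOL' vector
+of length \verb'm' and \verb'A' is an \verb'm'-by-\verb'n' \verb'GrB_BOOL'
+matrix) computes \verb"w'=u'*A" on the built-in boolean lor-land semiring:
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector w ;
+    GrB_Vector_new (&w, GrB_BOOL, n) ;
+    // w' = u'*A, with no mask and no accumulator
+    GrB_vxm (w, NULL, NULL, GrB_LOR_LAND_SEMIRING_BOOL, u, A, NULL) ; \end{verbatim}}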
+
+\paragraph{\bf Performance considerations:} % u'=u'*A
+If the \verb'GxB_FORMAT' of \verb'A' is \verb'GxB_BY_ROW', and the default
+descriptor is used (\verb'A' is not transposed), then \verb'GrB_vxm' is faster
+than \verb'GrB_mxv' with its default descriptor, when the vector \verb'u'
+is very sparse.
+However, if the \verb'GxB_FORMAT' of \verb'A' is \verb'GxB_BY_COL', then
+\verb'GrB_mxv' with its default descriptor is faster than \verb'GrB_vxm' with
+its default descriptor, when the vector \verb'u' is very sparse.
+Using the non-default \verb'GrB_TRAN' descriptor for \verb'A' makes the
+\verb'GrB_vxm' operation equivalent to \verb'GrB_mxv' with its default
+descriptor (with the operands of the multiplier reversed, as well). The
+converse also holds: \verb'GrB_mxv' with \verb'GrB_TRAN' is the same as
+\verb'GrB_vxm' with a default descriptor.
+
+\newpage
+%===============================================================================
+\subsection{{\sf GrB\_mxv:} matrix-vector multiply} %===========================
+%===============================================================================
+\label{mxv}
+\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
- double h ;
- GxB_get (GxB_HYPER_SWITCH, &h) ;
- printf ("hyper_switch = %g for all new matrices\n", h) ;
+GrB_Info GrB_mxv // w = accum (w, A*u)
+(
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
+ const GrB_Semiring semiring, // defines '+' and '*' for A*B
+ const GrB_Matrix A, // first input: matrix A
+ const GrB_Vector u, // second input: vector u
+ const GrB_Descriptor desc // descriptor for w, mask, and A
+) ;
+\end{verbatim} } \end{mdframed}
- double b [GxB_BITMAP_SWITCH] ;
- GxB_get (GxB_BITMAP_SWITCH, b) ;
- for (int k = 0 ; k < GxB_NBITMAP_SWITCH ; k++)
- {
- printf ("bitmap_switch [%d] = %g ", k, b [k]) ;
- if (k == 0)
- {
- printf ("for vectors and matrices with 1 row or column\n") ;
- }
- else if (k == GxB_NBITMAP_SWITCH - 1)
- {
- printf ("for matrices with min dimension > %d\n", 1 << (k-1)) ;
- }
- else
- {
- printf ("for matrices with min dimension %d to %d\n",
- (1 << (k-1)) + 1, 1 << k) ;
- }
- }
+\verb'GrB_mxv' multiplies a matrix \verb'A' times a column vector \verb'u'.
+The matrix \verb'A' may be first transposed according to \verb'desc' (as the
+first input); the column vector \verb'u' is never transposed via the
+descriptor. The inputs \verb'A' and \verb'u' are typecasted to match the
+\verb'xtype' and \verb'ytype' inputs, respectively, of the multiply operator of
+the \verb'semiring'. Next, an intermediate column vector \verb't=A*u' is
+computed on the \verb'semiring' using the same method as \verb'GrB_mxm'.
+Finally, the column vector \verb't' is typecasted from the \verb'ztype' of the
+multiply operator of the \verb'semiring' into the type of \verb'w', and the
+results are written back into \verb'w' using the optional accumulator
+\verb'accum' and \verb'mask'.
- GxB_Format_Value s ;
- GxB_get (GxB_FORMAT, &s) ;
- if (s == GxB_BY_COL) printf ("all new matrices are stored by column\n") ;
- else printf ("all new matrices are stored by row\n") ;
+The last step is ${\bf w \langle m \rangle = w \odot t}$, as described
+in Section~\ref{accummask}, except that all the terms are column vectors instead
+of matrices.
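+
+A common masked use is one step of a level-synchronous graph traversal. The
+sketch below is an illustration only, assuming \verb'A' is a symmetric
+\verb'GrB_BOOL' adjacency matrix of size \verb'n'-by-\verb'n', and \verb'q'
+(the current frontier) and \verb'visited' are \verb'GrB_BOOL' vectors of
+length \verb'n'; the built-in \verb'GrB_DESC_RSC' descriptor clears \verb'w'
+first and uses the complemented structural mask:
+
+{\footnotesize
+\begin{verbatim}
+    // q<!struct(visited),replace> = A*q on the boolean lor-land semiring
+    GrB_mxv (q, visited, NULL, GrB_LOR_LAND_SEMIRING_BOOL, A, q, GrB_DESC_RSC) ; \end{verbatim}}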
- GrB_mode mode ;
- GxB_get (GxB_MODE, &mode) ;
- if (mode == GrB_BLOCKING) printf ("GrB_init(GrB_BLOCKING) was called.\n") ;
- else printf ("GrB_init(GrB_NONBLOCKING) was called.\n") ;
+\paragraph{\bf Performance considerations:} % u=A*u
+Refer to the discussion of \verb'GrB_vxm'. In SuiteSparse:GraphBLAS,
+\verb'GrB_mxv' is very efficient when \verb'u' is sparse or dense, when the
+default descriptor is used, and when the matrix is \verb'GxB_BY_COL'. When
+\verb'u' is very sparse and \verb'GrB_INP0' is set to its non-default
+\verb'GrB_TRAN', then this method is not efficient if the matrix is in
+\verb'GxB_BY_COL' format. If an application needs to perform \verb"A'*u"
+repeatedly where \verb'u' is very sparse, then use the \verb'GxB_BY_ROW' format
+for \verb'A' instead.
- int nthreads_max ;
- GxB_get (GxB_NTHREADS, &nthreads_max) ;
- printf ("max # of threads to use: %d\n", nthreads_max) ;
+\newpage
+%===============================================================================
+\subsection{{\sf GrB\_eWiseMult:} element-wise operations, set intersection} %==
+%===============================================================================
+\label{eWiseMult}
- double chunk ;
- GxB_get (GxB_CHUNK, &chunk) ;
- printf ("chunk size: %g\n", chunk) ;
+Element-wise ``multiplication'' is shorthand for applying a binary operator
+element-wise on two matrices or vectors \verb'A' and \verb'B', for all entries
+that appear in the set intersection of the patterns of \verb'A' and \verb'B'.
+This is like \verb'A.*B' for two sparse matrices in MATLAB, except that in
+GraphBLAS any binary operator can be used, not just multiplication.
- int64_t free_pool_limit [64] ;
- GxB_get (GxB_MEMORY_POOL, free_pool_limit) ;
- for (int k = 0 ; k < 64 ; k++)
- printf ("pool %d: limit %ld\n", free_pool_limit [k]) ;
+The pattern of the result of the element-wise ``multiplication'' is exactly
+this set intersection. Entries in \verb'A' but not \verb'B', or vice versa, do
+not appear in the result.
- char *name ;
- int ver [3] ;
- GxB_get (GxB_LIBRARY_NAME, &name) ;
- GxB_get (GxB_LIBRARY_VERSION, ver) ;
- printf ("Library %s, version %d.%d.%d\n", name, ver [0], ver [1], ver [2]) ; \end{verbatim} }
+Let $\otimes$ denote the binary operator to be used. The computation ${\bf T =
+A \otimes B}$ is given below. Entries not in the intersection of ${\bf A}$ and
+${\bf B}$ do not appear in the pattern of ${\bf T}$. That is:
+ \vspace{-0.2in}
+ {\small
+ \begin{tabbing}
+ \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\
+ \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\
+ \> \> $t_{ij} = a_{ij} \otimes b_{ij}$ \\
+ \end{tabbing} }
+ \vspace{-0.2in}
-%-------------------------------------------------------------------------------
-\subsection{{\sf GxB\_Matrix\_Option\_get:} retrieve a matrix option}
-%-------------------------------------------------------------------------------
+Depending on what kind of operator is used and what the implicit value is
+assumed to be, this can give the Hadamard product. This is the case for
+\verb'A.*B' in MATLAB since the implicit value is zero. However, computing a
+Hadamard product is not necessarily the goal of the \verb'eWiseMult' operation.
+It simply applies any binary operator, built-in or user-defined, to the set
+intersection of \verb'A' and \verb'B', and discards any entry outside this
+intersection. Its usefulness in a user's application does not depend upon it
+computing a Hadamard product in all cases.
The operator need not be
+associative or commutative, nor have any particular property except for type
+compatibility with \verb'A', \verb'B', and the output matrix \verb'C'.
+
+The generic name for this operation is \verb'GrB_eWiseMult', which can be used
+for both matrices and vectors.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Vector\_eWiseMult:} element-wise vector multiply}
+%-------------------------------------------------------------------------------
+\label{eWiseMult_vector}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_get // gets the current option of a matrix
+GrB_Info GrB_eWiseMult // w = accum (w, u.*v)
(
- GrB_Matrix A, // matrix to query
- GxB_Option_Field field, // option to query
- ... // return value of the matrix option
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
+ const <operator> multiply, // defines '.*' for t=u.*v
+ const GrB_Vector u, // first input: vector u
+ const GrB_Vector v, // second input: vector v
+ const GrB_Descriptor desc // descriptor for w and mask
) ;
-\end{verbatim} } \end{mdframed}
+\end{verbatim}
+} \end{mdframed}
-This usage of \verb'GxB_get' retrieves the value of a matrix option. The
-\verb'field' parameter can be
-\verb'GxB_HYPER_SWITCH',
-\verb'GxB_BITMAP_SWITCH',
-\verb'GxB_SPARSITY_CONTROL',
-\verb'GxB_SPARSITY_STATUS',
-or
-\verb'GxB_FORMAT'.
-For example:
+\verb'GrB_Vector_eWiseMult' computes the element-wise ``multiplication'' of two
+vectors \verb'u' and \verb'v', element-wise using any binary operator (not just
+times). The vectors are not transposed via the descriptor. The vectors
+\verb'u' and \verb'v' are first typecasted into the first and second inputs of
+the \verb'multiply' operator. Next, a column vector \verb't' is computed,
+denoted ${\bf t = u \otimes v}$. The pattern of \verb't' is the set
+intersection of \verb'u' and \verb'v'. The result \verb't' has the type of the
+output \verb'ztype' of the \verb'multiply' operator.
-\vspace{-0.1in}
+The \verb'operator' is typically a \verb'GrB_BinaryOp', but the method is
+type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
+additive operator of the monoid is used as the \verb'multiply' binary operator.
+If given a semiring (\verb'GrB_Semiring'), the multiply operator of the
+semiring is used as the \verb'multiply' binary operator.
+
+The next and final step is ${\bf w \langle m \rangle = w \odot t}$, as
+described in Section~\ref{accummask}, except that all the terms are column
+vectors instead of matrices. Note that for all GraphBLAS operations, including
+this one, the accumulator ${\bf w \odot t}$ is always applied in a set union
+manner, even though ${\bf t = u \otimes v}$ for this operation is applied in a
+set intersection manner.
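+
+As a minimal sketch (an illustration, not from the specification), the
+following computes the Hadamard-like product \verb'w=u.*v' over the set
+intersection, using the built-in \verb'GrB_TIMES_FP64' operator, assuming
+\verb'u' and \verb'v' are \verb'GrB_FP64' vectors of length \verb'n':
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector w ;
+    GrB_Vector_new (&w, GrB_FP64, n) ;
+    // w = u.*v, applied only on the intersection of u and v
+    GrB_eWiseMult (w, NULL, NULL, GrB_TIMES_FP64, u, v, NULL) ; \end{verbatim}}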
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Matrix\_eWiseMult:} element-wise matrix multiply}
+%-------------------------------------------------------------------------------
+\label{eWiseMult_matrix}
+
+\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
- double h, b ;
- int sparsity, scontrol ;
- GxB_get (A, GxB_SPARSITY_STATUS, &sparsity) ;
- GxB_get (A, GxB_HYPER_SWITCH, &h) ;
- printf ("matrix A has hyper_switch = %g\n", h) ;
- GxB_get (A, GxB_BITMAP_SWITCH, &b) ;
- printf ("matrix A has bitmap_switch = %g\n", b) ;
- switch (sparsity)
- {
- case GxB_HYPERSPARSE: printf ("matrix A is hypersparse\n") ; break ;
- case GxB_SPARSE: printf ("matrix A is sparse\n" ) ; break ;
- case GxB_BITMAP: printf ("matrix A is bitmap\n" ) ; break ;
- case GxB_FULL: printf ("matrix A is full\n" ) ; break ;
- }
- GxB_Format_Value s ;
- GxB_get (A, GxB_FORMAT, &s) ;
- printf ("matrix A is stored by %s\n", (s == GxB_BY_COL) ? "col" : "row") ;
- GxB_get (A, GxB_SPARSITY_CONTROL, &scontrol) ;
- if (scontrol & GxB_HYPERSPARSE) printf ("A may become hypersparse\n") ;
- if (scontrol & GxB_SPARSE ) printf ("A may become sparse\n") ;
- if (scontrol & GxB_BITMAP ) printf ("A may become bitmap\n") ;
- if (scontrol & GxB_FULL ) printf ("A may become full\n") ; \end{verbatim} }
+GrB_Info GrB_eWiseMult // C = accum (C, A.*B)
+(
+ GrB_Matrix C, // input/output matrix for results
+ const GrB_Matrix Mask, // optional mask for C, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
+ const <operator> multiply, // defines '.*' for T=A.*B
+ const GrB_Matrix A, // first input: matrix A
+ const GrB_Matrix B, // second input: matrix B
+ const GrB_Descriptor desc // descriptor for C, Mask, A, and B
+) ;
+\end{verbatim}
+} \end{mdframed}
-\newpage
-%===============================================================================
-\subsection{{\sf GxB\_Desc\_get:} retrieve a {\sf GrB\_Descriptor} value}
-%===============================================================================
+\verb'GrB_Matrix_eWiseMult' computes the element-wise ``multiplication'' of two
+matrices \verb'A' and \verb'B', element-wise using any binary operator (not
+just times). The input matrices may be transposed first, according to the
+descriptor \verb'desc'. They are then typecasted into the first and second
+inputs of the \verb'multiply' operator. Next, a matrix \verb'T' is computed,
+denoted ${\bf T = A \otimes B}$. The pattern of \verb'T' is the set
+intersection of \verb'A' and \verb'B'. The result \verb'T' has the type of the
+output \verb'ztype' of the \verb'multiply' operator.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_get // get a parameter from a descriptor
-(
- GrB_Descriptor desc, // descriptor to query; NULL means defaults
- GrB_Desc_Field field, // parameter to query
- ... // value of the parameter
-) ;
-\end{verbatim} } \end{mdframed}
+The \verb'multiply' operator is typically a \verb'GrB_BinaryOp', but the method
+is type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
+additive operator of the monoid is used as the \verb'multiply' binary operator.
+If given a semiring (\verb'GrB_Semiring'), the multiply operator of the
+semiring is used as the \verb'multiply' binary operator.
-This usage is the same as \verb'GxB_Desc_get'.
The \verb'field' parameter can
-be \verb'GrB_OUTP', \verb'GrB_MASK', \verb'GrB_INP0', \verb'GrB_INP1',
-\verb'GxB_AxB_METHOD',
-\verb'GxB_NTHREADS',
-\verb'GxB_CHUNK',
-\verb'GxB_SORT',
-\verb'GxB_COMPRESSION', or
-\verb'GxB_IMPORT'.
-Refer to Section~\ref{desc_get} for details.
+\vspace{0.05in}
+The operation can be expressed in MATLAB notation as:
+ {\footnotesize
+ \begin{verbatim}
+ [nrows, ncols] = size (A.matrix) ;
+ T.matrix = zeros (nrows, ncols, multiply.ztype) ;
+ T.class = multiply.ztype ;
+ p = A.pattern & B.pattern ;
+ A = cast (A.matrix (p), multiply.xtype) ;
+ B = cast (B.matrix (p), multiply.ytype) ;
+ T.matrix (p) = multiply (A, B) ;
+ T.pattern = p ; \end{verbatim} }
+
+The final step is ${\bf C \langle M \rangle = C \odot T}$, as described in
+Section~\ref{accummask}. Note that for all GraphBLAS operations, including
+this one, the accumulator ${\bf C \odot T}$ is always applied in a set union
+manner, even though ${\bf T = A \otimes B}$ for this operation is applied in a
+set intersection manner.
+\newpage
%===============================================================================
-\subsection{Summary of usage of {\sf GxB\_set} and {\sf GxB\_get}}
+\subsection{{\sf GrB\_eWiseAdd:} element-wise operations, set union} %==========
%===============================================================================
+\label{eWiseAdd}
-The different usages of \verb'GxB_set' and \verb'GxB_get' are summarized below.
+Element-wise ``addition'' is shorthand for applying a binary operator
+element-wise on two matrices or vectors \verb'A' and \verb'B', for all entries
+that appear in the set intersection of the patterns of \verb'A' and \verb'B'.
+This is like \verb'A+B' for two sparse matrices in MATLAB, except that in
+GraphBLAS any binary operator can be used, not just addition. The pattern of
+the result of the element-wise ``addition'' is the set union of the pattern of
+\verb'A' and \verb'B'. Entries that appear in neither \verb'A' nor \verb'B' do
+not appear in the result.
-\noindent
-To set/get the global options:
+Let $\oplus$ denote the binary operator to be used. The computation ${\bf T =
+A \oplus B}$ is exactly the same as the computation with accumulator operator
+as described in Section~\ref{accummask}. It acts like a sparse matrix
+addition, except that any operator can be used. The pattern of ${\bf A \oplus
+B}$ is the set union of the patterns of ${\bf A}$ and ${\bf B}$, and the
+operator is applied only on the set intersection of ${\bf A}$ and ${\bf B}$.
+Entries not in either the pattern of ${\bf A}$ or ${\bf B}$ do not appear in
+the pattern of ${\bf T}$.
That is:
+ \vspace{-0.2in}
+ {\small
+ \begin{tabbing}
+ \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\
+ \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\
+ \> \> $t_{ij} = a_{ij} \oplus b_{ij}$ \\
+ \> for all entries $(i,j)$ in ${\bf A \setminus B}$ \\
+ \> \> $t_{ij} = a_{ij}$ \\
+ \> for all entries $(i,j)$ in ${\bf B \setminus A}$ \\
+ \> \> $t_{ij} = b_{ij}$
+ \end{tabbing}
+ }
- {\footnotesize
- \begin{verbatim}
- GxB_set (GxB_HYPER_SWITCH, double h) ;
- GxB_set (GxB_HYPER_SWITCH, GxB_ALWAYS_HYPER) ;
- GxB_set (GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
- GxB_get (GxB_HYPER_SWITCH, double *h) ;
- double b [GxB_NBITMAP_SWITCH] ;
- GxB_set (GxB_BITMAP_SWITCH, b) ;
- GxB_set (GxB_BITMAP_SWITCH, NULL) ; // set defaults
- GxB_get (GxB_BITMAP_SWITCH, b) ;
- GxB_set (GxB_FORMAT, GxB_BY_ROW) ;
- GxB_set (GxB_FORMAT, GxB_BY_COL) ;
- GxB_get (GxB_FORMAT, GxB_Format_Value *s) ;
- GxB_set (GxB_NTHREADS, int nthreads_max) ;
- GxB_get (GxB_NTHREADS, int *nthreads_max) ;
- GxB_set (GxB_CHUNK, double chunk) ;
- GxB_get (GxB_CHUNK, double *chunk) ;
- GxB_set (GxB_BURBLE, bool burble) ;
- GxB_get (GxB_BURBLE, bool *burble) ;
- GxB_set (GxB_PRINTF, void *printf_function) ;
- GxB_get (GxB_PRINTF, void **printf_function) ;
- GxB_set (GxB_FLUSH, void *flush_function) ;
- GxB_get (GxB_FLUSH, void **flush_function) ;
- int64_t free_pool_limit [64] ;
- GxB_set (GxB_MEMORY_POOL, free_pool_limit) ;
- GxB_set (GxB_MEMORY_POOL, NULL) ; // set defaults
- GxB_get (GxB_MEMORY_POOL, free_pool_limit) ;
- GxB_set (GxB_PRINT_1BASED, bool onebased) ;
- GxB_get (GxB_PRINT_1BASED, bool *onebased) ; \end{verbatim} }
+The only difference between element-wise ``multiplication'' (${\bf T = A \otimes
+B}$) and ``addition'' (${\bf T = A \oplus B}$) is the pattern of the result,
+and what happens to entries outside the intersection. With $\otimes$ the
+pattern of ${\bf T}$ is the intersection; with $\oplus$ it is the set union.
+Entries outside the set intersection are dropped for $\otimes$, and kept for
+$\oplus$; in both cases the operator is only applied to those (and only those)
+entries in the intersection. Any binary operator can be used interchangeably
+for either operation.
-\noindent
-To get global options that can be queried but not modified:
+Element-wise operations do not operate on the implicit values, even implicitly,
+since the operations make no assumption about the semiring. As a result, the
+results can be different from MATLAB, which can always assume the implicit
+value is zero. For example, \verb'C=A-B' is the conventional matrix
+subtraction in MATLAB. Computing \verb'A-B' in GraphBLAS with \verb'eWiseAdd'
+will apply the \verb'MINUS' operator to the intersection; entries in \verb'A'
+but not \verb'B' will be unchanged and appear in \verb'C', and entries in
+neither \verb'A' nor \verb'B' do not appear in \verb'C'. For these cases, the
+result matches the MATLAB \verb'C=A-B'. Entries in \verb'B' but not \verb'A'
+do appear in \verb'C' but they are not negated; they cannot be subtracted from
+an implicit value in \verb'A'. This is by design. If conventional matrix
+subtraction of two sparse matrices is required, and the implicit value is known
+to be zero, use \verb'GrB_apply' to negate the values in \verb'B', and then
+use \verb'eWiseAdd' with the \verb'PLUS' operator, to compute \verb'A+(-B)'.
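+
+A minimal sketch of that workaround, assuming \verb'A', \verb'B', and \verb'C'
+are \verb'GrB_FP64' matrices of size \verb'm'-by-\verb'n' and the implicit
+value is known to be zero:
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Matrix T ;
+    GrB_Matrix_new (&T, GrB_FP64, m, n) ;
+    GrB_apply (T, NULL, NULL, GrB_AINV_FP64, B, NULL) ;        // T = -B
+    GrB_eWiseAdd (C, NULL, NULL, GrB_PLUS_FP64, A, T, NULL) ;  // C = A+(-B)
+    GrB_free (&T) ; \end{verbatim}}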
- {\footnotesize
- \begin{verbatim}
- GxB_get (GxB_MODE, GrB_Mode *mode) ;
- GxB_get (GxB_LIBRARY_NAME, char **) ;
- GxB_get (GxB_LIBRARY_VERSION, int *) ;
- GxB_get (GxB_LIBRARY_DATE, char **) ;
- GxB_get (GxB_LIBRARY_ABOUT, char **) ;
- GxB_get (GxB_LIBRARY_LICENSE, char **) ;
- GxB_get (GxB_LIBRARY_COMPILE_DATE, char **) ;
- GxB_get (GxB_LIBRARY_COMPILE_TIME, char **) ;
- GxB_get (GxB_LIBRARY_URL, char **) ;
- GxB_get (GxB_API_VERSION, int *) ;
- GxB_get (GxB_API_DATE, char **) ;
- GxB_get (GxB_API_ABOUT, char **) ;
- GxB_get (GxB_API_URL, char **) ; \end{verbatim} }
+The generic name for this operation is \verb'GrB_eWiseAdd', which can be used
+for both matrices and vectors.
-\noindent
-To set/get a matrix option or status
+There is another minor difference between the two variants of the element-wise
+functions. If given a \verb'semiring', the \verb'eWiseAdd' functions use the
+binary operator of the semiring's monoid, while the \verb'eWiseMult' functions
+use the multiplicative operator of the semiring.
- {\footnotesize
- \begin{verbatim}
- GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, double h) ;
- GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, GxB_ALWAYS_HYPER) ;
- GxB_set (GrB_Matrix A, GxB_HYPER_SWITCH, GxB_NEVER_HYPER) ;
- GxB_get (GrB_Matrix A, GxB_HYPER_SWITCH, double *h) ;
- GxB_set (GrB_Matrix A, GxB_BITMAP_SWITCH, double b) ;
- GxB_get (GrB_Matrix A, GxB_BITMAP_SWITCH, double *b) ;
- GxB_set (GrB_Matrix A, GxB_FORMAT, GxB_BY_ROW) ;
- GxB_set (GrB_Matrix A, GxB_FORMAT, GxB_BY_COL) ;
- GxB_get (GrB_Matrix A, GxB_FORMAT, GxB_Format_Value *s) ;
- GxB_set (GrB_Matrix A, GxB_SPARSITY_CONTROL, GxB_AUTO_SPARSITY) ;
- GxB_set (GrB_Matrix A, GxB_SPARSITY_CONTROL, scontrol) ;
- GxB_get (GrB_Matrix A, GxB_SPARSITY_CONTROL, int *scontrol) ;
- GxB_get (GrB_Matrix A, GxB_SPARSITY_STATUS, int *sparsity) ; \end{verbatim} }
+% \newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Vector\_eWiseAdd:} element-wise vector addition}
+%-------------------------------------------------------------------------------
+\label{eWiseAdd_vector}
-\noindent
-To set/get a vector option or status:
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_eWiseAdd // w = accum (w, u+v)
+(
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
+ const <operator> add, // defines '+' for t=u+v
+ const GrB_Vector u, // first input: vector u
+ const GrB_Vector v, // second input: vector v
+ const GrB_Descriptor desc // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}
- {\footnotesize
- \begin{verbatim}
- GxB_set (GrB_Vector v, GxB_BITMAP_SWITCH, double b) ;
- GxB_get (GrB_Vector v, GxB_BITMAP_SWITCH, double *b) ;
- GxB_set (GrB_Vector v, GxB_FORMAT, GxB_BY_ROW) ;
- GxB_set (GrB_Vector v, GxB_FORMAT, GxB_BY_COL) ;
- GxB_get (GrB_Vector v, GxB_FORMAT, GxB_Format_Value *s) ;
- GxB_set (GrB_Vector v, GxB_SPARSITY_CONTROL, GxB_AUTO_SPARSITY) ;
- GxB_set (GrB_Vector v, GxB_SPARSITY_CONTROL, scontrol) ;
- GxB_get (GrB_Vector v, GxB_SPARSITY_CONTROL, int *scontrol) ;
- GxB_get (GrB_Vector v, GxB_SPARSITY_STATUS, int *sparsity) ; \end{verbatim} }
+\verb'GrB_Vector_eWiseAdd' computes the element-wise ``addition'' of two
+vectors \verb'u' and \verb'v', element-wise using any binary operator (not just
+plus). The vectors are not transposed via the descriptor.
Entries in the +intersection of \verb'u' and \verb'v' are first typecasted into the first and +second inputs of the \verb'add' operator. Next, a column vector \verb't' is +computed, denoted ${\bf t = u \oplus v}$. The pattern of \verb't' is the set +union of \verb'u' and \verb'v'. The result \verb't' has the type of the output +\verb'ztype' of the \verb'add' operator. -\noindent -To set/get a descriptor field: +The \verb'add' operator is typically a \verb'GrB_BinaryOp', but the method is +type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the +additive operator of the monoid is used as the \verb'add' binary operator. If +given a semiring (\verb'GrB_Semiring'), the additive operator of the monoid of +the semiring is used as the \verb'add' binary operator. - {\footnotesize - \begin{verbatim} - GxB_set (GrB_Descriptor d, GrB_OUTP, GxB_DEFAULT) ; - GxB_set (GrB_Descriptor d, GrB_OUTP, GrB_REPLACE) ; - GxB_get (GrB_Descriptor d, GrB_OUTP, GrB_Desc_Value *v) ; - GxB_set (GrB_Descriptor d, GrB_MASK, GxB_DEFAULT) ; - GxB_set (GrB_Descriptor d, GrB_MASK, GrB_COMP) ; - GxB_set (GrB_Descriptor d, GrB_MASK, GrB_STRUCTURE) ; - GxB_set (GrB_Descriptor d, GrB_MASK, GrB_COMP+GrB_STRUCTURE) ; - GxB_get (GrB_Descriptor d, GrB_MASK, GrB_Desc_Value *v) ; - GxB_set (GrB_Descriptor d, GrB_INP0, GxB_DEFAULT) ; - GxB_set (GrB_Descriptor d, GrB_INP0, GrB_TRAN) ; - GxB_get (GrB_Descriptor d, GrB_INP0, GrB_Desc_Value *v) ; - GxB_set (GrB_Descriptor d, GrB_INP1, GxB_DEFAULT) ; - GxB_set (GrB_Descriptor d, GrB_INP1, GrB_TRAN) ; - GxB_get (GrB_Descriptor d, GrB_INP1, GrB_Desc_Value *v) ; - GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_DEFAULT) ; - GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_GUSTAVSON) ; - GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_HASH) ; - GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_SAXPY) ; - GxB_set (GrB_Descriptor d, GxB_AxB_METHOD, GxB_AxB_DOT) ; - GxB_get (GrB_Descriptor d, GrB_AxB_METHOD, GrB_Desc_Value *v) ; - GxB_set (GrB_Descriptor d, GxB_NTHREADS, int nthreads) ; - GxB_get (GrB_Descriptor d, GxB_NTHREADS, int *nthreads) ; - GxB_set (GrB_Descriptor d, GxB_CHUNK, double chunk) ; - GxB_get (GrB_Descriptor d, GxB_CHUNK, double *chunk) ; - GxB_set (GrB_Descriptor d, GxB_SORT, sort) ; - GxB_get (GrB_Descriptor d, GxB_SORT, int *sort) ; - GxB_set (GrB_Descriptor d, GxB_COMPRESSION, GxB_FAST_IMPORT) ; - GxB_set (GrB_Descriptor d, GxB_COMPRESSION, GxB_SECURE_IMPORT) ; - GxB_get (GrB_Descriptor d, GxB_COMPRESSION, GrB_Desc_Value *method) ; - GxB_set (GrB_Descriptor d, GxB_IMPORT, int method) ; - GxB_get (GrB_Descriptor d, GxB_IMPORT, int *method) ; \end{verbatim} } +The final step is ${\bf w \langle m \rangle = w \odot t}$, as described in +Section~\ref{accummask}, except that all the terms are column vectors instead +of matrices. 
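+
+Because the method is type-generic in its \verb'add' parameter, a monoid can be
+passed directly. A minimal sketch, assuming \verb'w', \verb'u', and \verb'v'
+are \verb'GrB_FP64' vectors of the same length: the entry-wise maximum over the
+set union can be computed with the built-in \verb'GrB_MAX_MONOID_FP64' monoid:
+
+{\footnotesize
+\begin{verbatim}
+    // w = max (u, v), taken over the set union of the patterns of u and v
+    GrB_eWiseAdd (w, NULL, NULL, GrB_MAX_MONOID_FP64, u, v, NULL) ; \end{verbatim}}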
+
+% \newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Matrix\_eWiseAdd:} element-wise matrix addition}
+%-------------------------------------------------------------------------------
+\label{eWiseAdd_matrix}
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{SuiteSparse:GraphBLAS Colon and Index Notation} %%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{colon}
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_eWiseAdd // C = accum (C, A+B)
+(
+ GrB_Matrix C, // input/output matrix for results
+ const GrB_Matrix Mask, // optional mask for C, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
+ const <operator> add, // defines '+' for T=A+B
+ const GrB_Matrix A, // first input: matrix A
+ const GrB_Matrix B, // second input: matrix B
+ const GrB_Descriptor desc // descriptor for C, Mask, A, and B
+) ;
+\end{verbatim} } \end{mdframed}
-Octave/MATLAB uses a colon notation to index into matrices, such as
-\verb'C=A(2:4,3:8)', which extracts \verb'C' as 3-by-6 submatrix from \verb'A',
-from rows 2 through 4 and columns 3 to 8 of the matrix \verb'A'. A single
-colon is used to denote all rows, \verb'C=A(:,9)', or all columns,
-\verb'C=A(12,:)', which refers to the 9th column and 12th row of \verb'A',
-respectively. An arbitrary integer list can be given as well, such as the
-Octave/MATLAB statements:
+\verb'GrB_Matrix_eWiseAdd' computes the element-wise ``addition'' of two
+matrices \verb'A' and \verb'B', element-wise using any binary operator (not
+just plus). The input matrices may be transposed first, according to the
+descriptor \verb'desc'. Entries in the intersection are then typecasted into
+the first and second inputs of the \verb'add' operator. Next, a matrix \verb'T'
+is computed, denoted ${\bf T = A \oplus B}$. The pattern of \verb'T' is the set
+union of \verb'A' and \verb'B'. The result \verb'T' has the type of the output
+\verb'ztype' of the \verb'add' operator.
+
+The \verb'add' operator is typically a \verb'GrB_BinaryOp', but the method is
+type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
+additive operator of the monoid is used as the \verb'add' binary operator. If
+given a semiring (\verb'GrB_Semiring'), the additive operator of the monoid of
+the semiring is used as the \verb'add' binary operator.
+\vspace{0.05in}
+The operation can be expressed in MATLAB notation as:
 {\footnotesize
 \begin{verbatim}
- I = [2 1 4] ;
- J = [3 5] ;
- C = A (I,J) ; \end{verbatim} }
-\noindent
-which creates the 3-by-2 matrix \verb'C' as follows:
-\[
-C =
-\left[
-\begin{array}{cc}
-a_{2,3} & a_{2,5} \\
-a_{1,3} & a_{1,5} \\
-a_{4,3} & a_{4,5} \\
-\end{array}
-\right]
-\]
-
-The GraphBLAS API can do the equivalent of \verb'C=A(I,J)',
-\verb'C=A(:,J)', \verb'C=A(I,:)', and \verb'C=A(:,:)', by passing a parameter
-\verb'const GrB_Index *I' as either an array of size \verb'ni', or as the
-special value \verb'GrB_ALL', which corresponds to the stand-alone colon
-\verb'C=A(:,J)', and the same can be done for \verb'J'.. To compute
-\verb'C=A(2:4,3:8)' in GraphBLAS requires the user application to create two
-explicit integer arrays \verb'I' and \verb'J' of size 3 and 5, respectively,
-and then fill them with the explicit values \verb'[2,3,4]' and
-\verb'[3,4,5,6,7,8]'.
This works well if the lists are small, or if the matrix
-has more entries than rows or columns.
+ [nrows, ncols] = size (A.matrix) ;
+ T.matrix = zeros (nrows, ncols, add.ztype) ;
+ p = A.pattern & B.pattern ;
+ A1 = GB_mex_cast (A.matrix (p), add.xtype) ;
+ B1 = GB_mex_cast (B.matrix (p), add.ytype) ;
+ T.matrix (p) = add (A1, B1) ;
+ p = A.pattern & ~B.pattern ; T.matrix (p) = cast (A.matrix (p), add.ztype) ;
+ p = ~A.pattern & B.pattern ; T.matrix (p) = cast (B.matrix (p), add.ztype) ;
+ T.pattern = A.pattern | B.pattern ;
+ T.class = add.ztype ; \end{verbatim} }
+Except for when typecasting is performed, this is identical to how the
+\verb'accum' operator is applied in Figure~\ref{fig_accummask}.
-However, particularly with hypersparse matrices, the size of the explicit
-arrays \verb'I' and \verb'J' can vastly exceed the number of entries in the
-matrix. When using its hypersparse format, SuiteSparse:GraphBLAS allows the
-user application to create a \verb'GrB_Matrix' with dimensions up to $2^{60}$,
-with no memory constraints. The only constraint on memory usage in a
-hypersparse matrix is the number of entries in the matrix.
+The final step is ${\bf C \langle M \rangle = C \odot T}$, as described in
+Section~\ref{accummask}.
-For example, creating a $n$-by-$n$ matrix \verb'A' of type \verb'GrB_FP64' with
-$n=2^{60}$ and one million entries is trivial to do in Version 2.1 (and later)
-of SuiteSparse:GraphBLAS, taking at most 24MB of space. SuiteSparse:GraphBLAS
-Version 2.1 (or later) could do this on an old smartphone. However, using just
-the pure GraphBLAS API, constructing \verb'C=A(0:(n/2),0:(n/2))'
-in SuiteSparse Version 2.0 would require the creation of an integer array
-\verb'I' of size $2^{59}$, containing the sequence 0, 1, 2, 3, ...., requiring
-about 4 ExaBytes of memory (4 million terabytes). This is roughly 1000 times
-larger than the memory size of the world's largest computer in 2018.
+\newpage
+%===============================================================================
+\subsection{{\sf GxB\_eWiseUnion:} element-wise operations, set union} %========
+%===============================================================================
+\label{eWiseUnion}
-SuiteSparse:GraphBLAS Version 2.1 and later extends the GraphBLAS API with a
-full implementation of the MATLAB colon notation for integers,
-\verb'I=begin:inc:end'. This extension allows the construction of the matrix
-\verb'C=A(0:(n/2),0:(n/2))' in this example, with dimension $2^{59}$, probably
-taking just milliseconds on an old smartphone.
+\verb'GxB_eWiseUnion' computes a result with the same pattern as
+\verb'GrB_eWiseAdd', namely, a set union of its two inputs. It differs in how
+the binary operator is applied.
-The \verb'GrB_extract', \verb'GrB_assign', and \verb'GxB_subassign' operations
-(described in the Section~\ref{operations}) each have parameters that define a
-list of integer indices, using two parameters:
+Let $\oplus$ denote the binary operator to be used. The operator is applied to
+every entry in $\bf A$ and $\bf B$. A pair of scalars, $\alpha$ and $\beta$
+(\verb'alpha' and \verb'beta' in the API, respectively) define the
+inputs to the operator when entries are present in one matrix but not the
+other.
- \vspace{-0.05in}
- {\footnotesize
- \begin{verbatim}
- const GrB_Index *I ; // an array, or a special value GrB_ALL
- GrB_Index ni ; // the size of I, or a special value \end{verbatim}}
+ \vspace{-0.2in}
+ {\small
+ \begin{tabbing}
+ \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\
+ \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\
+ \> \> $t_{ij} = a_{ij} \oplus b_{ij}$ \\
+ \> for all entries $(i,j)$ in ${\bf A \setminus B}$ \\
+ \> \> $t_{ij} = a_{ij} \oplus \beta $ \\
+ \> for all entries $(i,j)$ in ${\bf B \setminus A}$ \\
+ \> \> $t_{ij} = \alpha \oplus b_{ij}$
+ \end{tabbing}
+ }
-\vspace{-0.05in}
-These two parameters define five kinds of index lists, which can be used to
-specify either an explicit or implicit list of row indices and/or column
-indices. The length of the list of indices is denoted \verb'|I|'. This
-discussion applies equally to the row indices \verb'I' and the column indices
-\verb'J'. The five kinds are listed below.
+\verb'GxB_eWiseUnion' is useful in contexts where \verb'GrB_eWiseAdd' cannot be
+used because of the typecasting rules of GraphBLAS. In particular, suppose
+\verb'A' and \verb'B' are matrices with a user-defined type, and suppose
+\verb'<' is a user-defined operator that compares two entries of this type and
+returns a Boolean value. Then \verb'C=A<B' can be computed with
+\verb'GxB_eWiseUnion', but not with \verb'GrB_eWiseAdd', since the latter would
+require entries of \verb'A' and \verb'B' outside the intersection to be
+typecasted into the boolean type of \verb'C', which is not possible for a
+user-defined type.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Vector\_eWiseUnion:} element-wise vector addition}
+%-------------------------------------------------------------------------------
+\label{eWiseUnion_vector}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_eWiseUnion // w = accum (w, u+v)
+(
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
+ const GrB_BinaryOp add, // defines '+' for t=u+v
+ const GrB_Vector u, // first input: vector u
+ const GrB_Scalar alpha,
+ const GrB_Vector v, // second input: vector v
+ const GrB_Scalar beta,
+ const GrB_Descriptor desc // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}
-\item To specify a contiguous range of indices, such as \verb'I=10:20'
- in MATLAB, the array \verb'I' has size 2, and \verb'ni' is passed to
- SuiteSparse:GraphBLAS as the special value \verb'ni = GxB_RANGE'. The
- beginning index is \verb'I[GxB_BEGIN]' and the ending index is
- \verb'I[GxB_END]'. Both values must be non-negative since
- \verb'GrB_Index' is an unsigned integer (\verb'uint64_t'). The value of
- \verb'I[GxB_INC]' is ignored.
+Identical to \verb'GrB_Vector_eWiseAdd' except that two scalars are used
+to define how to compute the result when entries are present in one of
+the two input vectors (\verb'u' and \verb'v'), but not the other.
+Each of the two input scalars, \verb'alpha' and \verb'beta',
+must contain an entry.
+When computing the result \verb't=u+v',
+if \verb'u(i)' is present but \verb'v(i)' is not, then \verb't(i)=u(i)+beta'.
+Likewise,
+if \verb'v(i)' is present but \verb'u(i)' is not, then \verb't(i)=alpha+v(i)',
+where \verb'+' denotes the binary operator, \verb'add'.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Matrix\_eWiseUnion:} element-wise matrix addition}
+%-------------------------------------------------------------------------------
+\label{eWiseUnion_matrix}
- \vspace{-0.05in}
- {\footnotesize
- \begin{verbatim}
- // to specify I = 10:20
- GrB_Index I [2], ni = GxB_RANGE ;
- I [GxB_BEGIN] = 10 ; // the start of the sequence
- I [GxB_END ] = 20 ; // the end of the sequence \end{verbatim}}
-
- \vspace{-0.05in}
- Let $b$ = \verb'I[GxB_BEGIN]', let $e$ = \verb'I[GxB_END]',
- The sequence has length zero if $b > e$; otherwise the length is
- $|I| = (e-b) + 1$.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_eWiseUnion // C = accum (C, A+B)
+(
+ GrB_Matrix C, // input/output matrix for results
+ const GrB_Matrix Mask, // optional mask for C, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
+ const GrB_BinaryOp add, // defines '+' for T=A+B
+ const GrB_Matrix A, // first input: matrix A
+ const GrB_Scalar alpha,
+ const GrB_Matrix B, // second input: matrix B
+ const GrB_Scalar beta,
+ const GrB_Descriptor desc // descriptor for C, M, A, and B
+) ;
+\end{verbatim} } \end{mdframed}
-\item To specify a strided range of indices with a non-negative stride,
- such as \verb'I=3:2:10', the array \verb'I' has size 3, and \verb'ni' has
- the special value \verb'GxB_STRIDE'. This is the sequence 3, 5, 7, 9, of
- length 4. Note that 10 does not appear in the list. The end point need
- not appear if the increment goes past it.
+Identical to \verb'GrB_Matrix_eWiseAdd' except that two scalars are used
+to define how to compute the result when entries are present in one of
+the two input matrices (\verb'A' and \verb'B'), but not the other.
+Each of the two input scalars, \verb'alpha' and \verb'beta',
+must contain an entry.
+When computing the result \verb'T=A+B',
+if \verb'A(i,j)' is present but \verb'B(i,j)' is not, then \verb'T(i,j)=A(i,j)+beta'.
+Likewise,
+if \verb'B(i,j)' is present but \verb'A(i,j)' is not, then \verb'T(i,j)=alpha+B(i,j)',
+where \verb'+' denotes the binary operator, \verb'add'.
- \vspace{-0.05in}
- {\footnotesize
- \begin{verbatim}
- // to specify I = 3:2:10
- GrB_Index I [3], ni = GxB_STRIDE ;
- I [GxB_BEGIN ] = 3 ; // the start of the sequence
- I [GxB_INC ] = 2 ; // the increment
- I [GxB_END ] = 10 ; // the end of the sequence \end{verbatim}}
+\newpage
+%===============================================================================
+\subsection{{\sf GrB\_extract:} submatrix extraction } %========================
+%===============================================================================
+\label{extract}
- \vspace{-0.05in}
- The \verb'GxB_STRIDE' sequence is the same as the \verb'List' generated by
- the following for loop:
+The \verb'GrB_extract' function is a generic name for three specific functions:
+\verb'GrB_Vector_extract', \verb'GrB_Col_extract', and
+\verb'GrB_Matrix_extract'. The generic name appears in the function signature,
+but the specific function name is used when describing what each variation
+does.
- \vspace{-0.05in}
- {\footnotesize
- \begin{verbatim}
- int64_t k = 0 ;
- GrB_Index *List = (a pointer to an array of large enough size)
- for (int64_t i = I [GxB_BEGIN] ; i <= I [GxB_END] ; i += I [GxB_INC])
- {
- // i is the kth entry in the sequence
- List [k++] = i ;
- } \end{verbatim}}
+% \newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Vector\_extract:} extract subvector from vector}
+%-------------------------------------------------------------------------------
+\label{extract_vector}
- \vspace{-0.05in}
- Then passing the explicit array \verb'List' and its length \verb'ni=k' has
- the same effect as passing in the array \verb'I' of size 3, with
- \verb'ni=GxB_STRIDE'. The latter is simply much faster to produce, and
- much more efficient for SuiteSparse:GraphBLAS to process.
+\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GrB_extract // w = accum (w, u(I)) +( + GrB_Vector w, // input/output vector for results + const GrB_Vector mask, // optional mask for w, unused if NULL + const GrB_BinaryOp accum, // optional accum for z=accum(w,t) + const GrB_Vector u, // first input: vector u + const GrB_Index *I, // row indices + const GrB_Index ni, // number of row indices + const GrB_Descriptor desc // descriptor for w and mask +) ; +\end{verbatim} } \end{mdframed} - Let $b$ = \verb'I[GxB_BEGIN]', let $e$ = \verb'I[GxB_END]', and let - $\Delta$ = \verb'I[GxB_INC]'. The sequence has length zero if $b > e$ or - $\Delta=0$. Otherwise, the length of the sequence is - \[ - |I| = \Bigl\lfloor\dfrac{e-b}{\Delta}\Bigr\rfloor + 1 - \] +\verb'GrB_Vector_extract' extracts a subvector from another vector, identical +to \verb't = u (I)' in MATLAB where \verb'I' is an integer vector of row +indices. Refer to \verb'GrB_Matrix_extract' for further details; vector +extraction is the same as matrix extraction with \verb'n'-by-1 matrices. +See Section~\ref{colon} for a description of \verb'I' and \verb'ni'. +The final step is ${\bf w \langle m \rangle = w \odot +t}$, as described in Section~\ref{accummask}, except that all the terms are +column vectors instead of matrices. -\item - In MATLAB notation, if the stride is negative, the sequence is decreasing. - For example, \verb'10:-2:1' is the sequence 10, 8, 6, 4, 2, in that order. - In SuiteSparse:GraphBLAS, use \verb'ni = GxB_BACKWARDS', with an array - \verb'I' of size 3. The following example specifies defines the equivalent - of the MATLAB expression \verb'10:-2:1' in SuiteSparse:GraphBLAS: +\newpage +%------------------------------------------------------------------------------- +\subsubsection{{\sf GrB\_Matrix\_extract:} extract submatrix from matrix} +%------------------------------------------------------------------------------- +\label{extract_matrix} - \vspace{-0.1in} - {\footnotesize - \begin{verbatim} - // to specify I = 10:-2:1 - GrB_Index I [3], ni = GxB_BACKWARDS ; - I [GxB_BEGIN ] = 10 ; // the start of the sequence - I [GxB_INC ] = 2 ; // the magnitude of the increment - I [GxB_END ] = 1 ; // the end of the sequence \end{verbatim}} +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GrB_extract // C = accum (C, A(I,J)) +( + GrB_Matrix C, // input/output matrix for results + const GrB_Matrix Mask, // optional mask for C, unused if NULL + const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) + const GrB_Matrix A, // first input: matrix A + const GrB_Index *I, // row indices + const GrB_Index ni, // number of row indices + const GrB_Index *J, // column indices + const GrB_Index nj, // number of column indices + const GrB_Descriptor desc // descriptor for C, Mask, and A +) ; +\end{verbatim} } \end{mdframed} - \vspace{-0.1in} - The value -2 cannot be assigned to the \verb'GrB_Index' array \verb'I', - since that is an unsigned type. The signed increment is represented - instead with the special value \verb'ni = GxB_BACKWARDS'. - The \verb'GxB_BACKWARDS' sequence is the same as generated by the following - for loop: +\verb'GrB_Matrix_extract' extracts a submatrix from another matrix, identical +to \verb'T = A(I,J)' in MATLAB where \verb'I' and \verb'J' are integer vectors +of row and column indices, respectively, except that indices are zero-based in +GraphBLAS and one-based in MATLAB. 
The input matrix \verb'A' may be transposed
+first, via the descriptor. The types of \verb'T' and \verb'A' are the same.
+The size of \verb'C' is \verb'|I|'-by-\verb'|J|'.
+Entries outside \verb'A(I,J)' are not accessed and do not take part in the
+computation. More precisely, assuming the matrix \verb'A' is not transposed,
+the matrix \verb'T' is defined as follows:

 \vspace{-0.1in}
 {\footnotesize
 \begin{verbatim}
- int64_t k = 0 ;
- GrB_Index *List = (a pointer to an array of large enough size)
- for (int64_t i = I [GxB_BEGIN] ; i >= I [GxB_END] ; i -= I [GxB_INC])
- {
- // i is the kth entry in the sequence
- List [k++] = i ;
- } \end{verbatim}}
-
- \vspace{-0.1in}
- Let $b$ = \verb'I[GxB_BEGIN]', let $e$ = \verb'I[GxB_END]', and let
- $\Delta$ = \verb'I[GxB_INC]' (note that $\Delta$ is not negative). The
- sequence has length zero if $b < e$ or $\Delta=0$. Otherwise, the length
- of the sequence is
- \[
- |I| = \Bigl\lfloor\dfrac{b-e}{\Delta}\Bigr\rfloor + 1
- \]
-
-\end{enumerate}
-
-Since \verb'GrB_Index' is an unsigned integer, all three values
-\verb'I[GxB_BEGIN]', \verb'I[GxB_INC]', and \verb'I[GxB_END]' must
-be non-negative.
-
-Just as in MATLAB, it is valid to specify an empty sequence of length zero.
-For example, \verb'I = 5:3' has length zero in MATLAB and the same is
-true for a \verb'GxB_RANGE' sequence in SuiteSparse:GraphBLAS, with
-\verb'I[GxB_BEGIN]=5' and \verb'I[GxB_END]=3'. This has the same
-effect as array \verb'I' with \verb'ni=0'.
-
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{GraphBLAS Operations} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{operations}
-
-The next sections define each of the GraphBLAS operations, also listed in the
-table below.
+ T.matrix = zeros (ni, nj) ; % a matrix of size ni-by-nj + T.pattern = false (ni, nj) ; + for i = 1:ni + for j = 1:nj + if (A (I(i),J(j)).pattern) + T (i,j).matrix = A (I(i),J(j)).matrix ; + T (i,j).pattern = true ; + end + end + end \end{verbatim}} -\vspace{0.2in} -{\small -\begin{tabular}{lll} -\hline -\verb'GrB_mxm' & matrix-matrix multiply & ${\bf C \langle M \rangle = C \odot AB}$ \\ -\verb'GrB_vxm' & vector-matrix multiply & ${\bf w^{\sf T}\langle m^{\sf T}\rangle = w^{\sf T}\odot u^{\sf T}A}$ \\ -\verb'GrB_mxv' & matrix-vector multiply & ${\bf w \langle m \rangle = w \odot Au}$ \\ -\hline -\verb'GrB_eWiseMult' & element-wise, & ${\bf C \langle M \rangle = C \odot (A \otimes B)}$ \\ - & set intersection & ${\bf w \langle m \rangle = w \odot (u \otimes v)}$ \\ -\hline -\verb'GrB_eWiseAdd' & element-wise, & ${\bf C \langle M \rangle = C \odot (A \oplus B)}$ \\ - & set union & ${\bf w \langle m \rangle = w \odot (u \oplus v)}$ \\ -\hline -\verb'GxB_eWiseUnion'& element-wise, & ${\bf C \langle M \rangle = C \odot (A \oplus B)}$ \\ - & set union & ${\bf w \langle m \rangle = w \odot (u \oplus v)}$ \\ -\hline -\verb'GrB_extract' & extract submatrix & ${\bf C \langle M \rangle = C \odot A(I,J)}$ \\ - & & ${\bf w \langle m \rangle = w \odot u(i)}$ \\ -\hline -\verb'GxB_subassign' & assign submatrix, & ${\bf C (I,J) \langle M \rangle = C(I,J) \odot A}$ \\ - & with submask for ${\bf C(I,J)}$ - & ${\bf w (i) \langle m \rangle = w(i) \odot u}$ \\ -\hline -\verb'GrB_assign' & assign submatrix & ${\bf C \langle M \rangle (I,J) = C(I,J) \odot A}$ \\ - & with submask for ${\bf C}$ - & ${\bf w \langle m \rangle (i) = w(i) \odot u}$ \\ -\hline -\verb'GrB_apply' & apply unary operator & ${\bf C \langle M \rangle = C \odot} f{\bf (A)}$ \\ - & & ${\bf w \langle m \rangle = w \odot} f{\bf (u)}$ \\ - & apply binary operator & ${\bf C \langle M \rangle = C \odot} f(x,{\bf A})$ \\ - & & ${\bf C \langle M \rangle = C \odot} f({\bf A},y)$ \\ - & & ${\bf w \langle m \rangle = w \odot} f(x,{\bf x})$ \\ - & & ${\bf w \langle m \rangle = w \odot} f({\bf u},y)$ \\ - & apply index-unary op & ${\bf C \langle M \rangle = C \odot} f({\bf A},i,j,k)$ \\ - & & ${\bf w \langle m \rangle = w \odot} f({\bf u},i,0,k)$ \\ -\hline -\verb'GrB_select' & select entries & ${\bf C \langle M \rangle = C \odot} \mbox{select}({\bf A},i,j,k)$ \\ - & & ${\bf w \langle m \rangle = w \odot} \mbox{select}({\bf u},i,0,k)$ \\ -\hline -\verb'GrB_reduce' & reduce to vector & ${\bf w \langle m \rangle = w \odot} [{\oplus}_j {\bf A}(:,j)]$ \\ - & reduce to scalar & $s = s \odot [{\oplus}_{ij} {\bf A}(I,J)]$ \\ -\hline -\verb'GrB_transpose' & transpose & ${\bf C \langle M \rangle = C \odot A^{\sf T}}$ \\ -\hline -\verb'GrB_kronecker' & Kronecker product & ${\bf C \langle M \rangle = C \odot \mbox{kron}(A, B)}$ \\ -\hline -\end{tabular} -} -\vspace{0.2in} +\vspace{-0.1in} +If duplicate indices are present in \verb'I' or \verb'J', the above method +defines the result in \verb'T'. Duplicates result in the same values of +\verb'A' being copied into different places in \verb'T'. +See Section~\ref{colon} for a description of the row indices +\verb'I' and \verb'ni', and the column indices +\verb'J' and \verb'nj'. +The final step is ${\bf C \langle M \rangle = C \odot +T}$, as described in Section~\ref{accummask}. -If an error occurs, \verb'GrB_error(&err,C)' or \verb'GrB_error(&err,w)' -returns details about the error, for operations that return a modified matrix -\verb'C' or vector \verb'w'. 
The only operation that cannot return an error
-string is reduction to a scalar with \verb'GrB_reduce'.
+\paragraph{\bf Performance considerations:} % C=A(I,J)
+If \verb'A' is not transposed via the input descriptor: if \verb'|I|' is small,
+then it is fastest if \verb'A' is \verb'GxB_BY_ROW'; if
+\verb'|J|' is small, then it is fastest if \verb'A' is
+\verb'GxB_BY_COL'. The opposite is true if \verb'A' is transposed.

-\newpage
-%===============================================================================
-\subsection{{\sf GrB\_mxm:} matrix-matrix multiply} %===========================
-%===============================================================================
-\label{mxm}
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Col\_extract:} extract column vector from matrix}
+%-------------------------------------------------------------------------------
+\label{extract_column}

 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GrB_mxm // C = accum (C, A*B)
+GrB_Info GrB_extract // w = accum (w, A(I,j))
 (
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_Semiring semiring, // defines '+' and '*' for A*B
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w, unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
 const GrB_Matrix A, // first input: matrix A
- const GrB_Matrix B, // second input: matrix B
- const GrB_Descriptor desc // descriptor for C, Mask, A, and B
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
+ const GrB_Index j, // column index
+ const GrB_Descriptor desc // descriptor for w, mask, and A
 ) ;
\end{verbatim} } \end{mdframed}

-\verb'GrB_mxm' multiplies two sparse matrices \verb'A' and \verb'B' using the
-\verb'semiring'. The input matrices \verb'A' and \verb'B' may be transposed
-according to the descriptor, \verb'desc' (which may be \verb'NULL') and then
-typecasted to match the multiply operator of the \verb'semiring'. Next,
-\verb'T=A*B' is computed on the \verb'semiring', precisely defined in the
-\verb'GB_spec_mxm.m' script in \verb'GraphBLAS/Test'. The actual algorithm
-exploits sparsity and does not take $O(n^3)$ time, but it computes the
-following:
-
-{\footnotesize
-\begin{verbatim}
-[m s] = size (A.matrix) ;
-[s n] = size (B.matrix) ;
-T.matrix = zeros (m, n, multiply.ztype) ;
-T.pattern = zeros (m, n, 'logical') ;
-T.matrix (:,:) = identity ; % the identity of the semiring's monoid
-T.class = multiply.ztype ; % the ztype of the semiring's multiply op
-A = cast (A.matrix, multiply.xtype) ; % the xtype of the semiring's multiply op
-B = cast (B.matrix, multiply.ytype) ; % the ytype of the semiring's multiply op
-for j = 1:n
- for i = 1:m
- for k = 1:s
- % T (i,j) += A (i,k) * B (k,j), using the semiring
- if (A.pattern (i,k) && B.pattern (k,j))
- z = multiply (A (i,k), B (k,j)) ;
- T.matrix (i,j) = add (T.matrix (i,j), z) ;
- T.pattern (i,j) = true ;
- end
- end
- end
-end \end{verbatim}}
-
-Finally, \verb'T' is typecasted into the type of \verb'C', and the results are
-written back into \verb'C' via the \verb'accum' and \verb'Mask', ${\bf C
-\langle M \rangle = C \odot T}$. The latter step is reflected in the MATLAB
-function \verb'GB_spec_accum_mask.m', discussed in Section~\ref{accummask}.
-
-\paragraph{\bf Performance considerations:}
-Suppose all matrices are in \verb'GxB_BY_COL' format, and \verb'B' is extremely
-sparse but \verb'A' is not as sparse. Then computing \verb'C=A*B' is very
-fast, and much faster than when \verb'A' is extremely sparse. For example, if
-\verb'A' is square and \verb'B' is a column vector that is all nonzero except
-for one entry \verb'B(j,0)=1', then \verb'C=A*B' is the same as extracting
-column \verb'A(:,j)'. This is very fast if \verb'A' is stored by column but
-slow if \verb'A' is stored by row. If \verb'A' is a sparse row with a single
-entry \verb'A(0,i)=1', then \verb'C=A*B' is the same as extracting row
-\verb'B(i,:)'. This is fast if \verb'B' is stored by row but slow if \verb'B'
-is stored by column.
+\verb'GrB_Col_extract' extracts a subvector from a matrix, identical to
+\verb't = A (I,j)' in MATLAB where \verb'I' is an integer vector of row indices
+and where \verb'j' is a single column index. The input matrix \verb'A' may be
+transposed first, via the descriptor, which results in the extraction of a
+single row \verb'j' from the matrix \verb'A', the result of which is a column
+vector \verb'w'. The types of \verb't' and \verb'A' are the same.
+The size of \verb'w' is \verb'|I|'-by-1.

-If the user application needs to repeatedly extract rows and columns from a
-matrix, whether by matrix multiplication or by \verb'GrB_extract', then keep
-two copies: one stored by row, and other by column, and use the copy that
-results in the fastest computation.
+See Section~\ref{colon} for a description of the row indices
+\verb'I' and \verb'ni'.
+The final step is ${\bf w \langle m
+\rangle = w \odot t}$, as described in Section~\ref{accummask}, except that
+all the terms are column vectors instead of matrices.

-By default, \verb'GrB_mxm', \verb'GrB_mxv', \verb'GrB_vxm', and
-\verb'GrB_reduce' (to vector) can return their result in a jumbled state, with
-the sort left pending. It can sometimes be faster for these methods to do the
-sort as they compute their result. Use the \verb'GxB_SORT' descriptor setting
-to select this option. Refer to Section~\ref{descriptor} for details.
+\paragraph{\bf Performance considerations:} % w = A(I,j)
+If \verb'A' is not transposed: it is fastest if the format of \verb'A' is
+\verb'GxB_BY_COL'. The opposite is true if \verb'A' is transposed.

 \newpage
 %===============================================================================
-\subsection{{\sf GrB\_vxm:} vector-matrix multiply} %===========================
+\subsection{{\sf GxB\_subassign:} submatrix assignment} %=======================
 %===============================================================================
-\label{vxm}
+\label{subassign}
+
+The methods described in this section are all variations of the form
+\verb'C(I,J)=A', which modifies a submatrix of the matrix \verb'C'. All
+methods can be used in their generic form with the single name
+\verb'GxB_subassign'. This is reflected in the prototypes. However, to avoid
+confusion between the different kinds of assignment, the name of the specific
+function is used when describing each variation. If the discussion applies to
+all variations, the simple name \verb'GxB_subassign' is used.
+
+See Section~\ref{colon} for a description of the row indices
+\verb'I' and \verb'ni', and the column indices
+\verb'J' and \verb'nj'.
+
+\verb'GxB_subassign' is very similar to \verb'GrB_assign', described in
+Section~\ref{assign}. The two operations are compared and contrasted in
+Section~\ref{compare_assign}.
For a discussion of how duplicate indices
+are handled in \verb'I' and \verb'J', see Section~\ref{duplicates}.
+
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GxB\_Vector\_subassign:} assign to a subvector }
+%-------------------------------------------------------------------------------
+\label{subassign_vector}

 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GrB_vxm // w' = accum (w, u'*A)
+GrB_Info GxB_subassign // w(I) = accum (w(I),u)
 (
- GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const GrB_Semiring semiring, // defines '+' and '*' for u'*A
+ GrB_Vector w, // input/output vector for results
+ const GrB_Vector mask, // optional mask for w(I), unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w(I),t)
 const GrB_Vector u, // first input: vector u
- const GrB_Matrix A, // second input: matrix A
- const GrB_Descriptor desc // descriptor for w, mask, and A
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
+ const GrB_Descriptor desc // descriptor for w(I) and mask
 ) ;
\end{verbatim} } \end{mdframed}

-\verb'GrB_vxm' multiplies a row vector \verb"u'" times a matrix \verb'A'. The
-matrix \verb'A' may be first transposed according to \verb'desc' (as the second
-input, \verb'GrB_INP1'); the column vector \verb'u' is never transposed via the
-descriptor. The inputs \verb'u' and \verb'A' are typecasted to match the
-\verb'xtype' and \verb'ytype' inputs, respectively, of the multiply operator of
-the \verb'semiring'. Next, an intermediate column vector \verb"t=A'*u" is
-computed on the \verb'semiring' using the same method as \verb'GrB_mxm'.
-Finally, the column vector \verb't' is typecasted from the \verb'ztype' of the
-multiply operator of the \verb'semiring' into the type of \verb'w', and the
-results are written back into \verb'w' using the optional accumulator
-\verb'accum' and \verb'mask'.
+\verb'GxB_Vector_subassign' operates on a subvector \verb'w(I)' of \verb'w',
+modifying it with the vector \verb'u'. The method is identical to
+\verb'GxB_Matrix_subassign' described in Section~\ref{subassign_matrix}, where
+all matrices have a single column each. The \verb'mask' has the same size as
+\verb'w(I)' and \verb'u'. The only other difference is that the input \verb'u'
+in this method is not transposed via the \verb'GrB_INP0' descriptor.

-The last step is ${\bf w \langle m \rangle = w \odot t}$, as described
-in Section~\ref{accummask}, except that all the
-terms are column vectors instead of matrices.
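+
+As a concrete illustration, the following sketch (not part of the
+specification; the vector length, indices, and values are hypothetical)
+assigns a 3-entry vector into \verb'w(I)' with \verb'GrB_PLUS_FP64' as the
+accumulator:
+
+ {\footnotesize
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+
+ // sketch: w (I) += u, where w is a GrB_FP64 vector of length 10
+ void vector_subassign_sketch (GrB_Vector w)
+ {
+     GrB_Vector u ;
+     GrB_Vector_new (&u, GrB_FP64, 3) ;
+     GrB_Vector_setElement_FP64 (u, 1.5, 0) ;  // u(0) = 1.5
+     GrB_Vector_setElement_FP64 (u, 2.5, 2) ;  // u(2) = 2.5 ; u(1) is empty
+     GrB_Index I [3] = { 2, 4, 9 } ;           // w(2), w(4), and w(9)
+     // w(2) and w(9) are summed with u(0) and u(2), or created if not
+     // yet present ; w(4) is left unchanged since u(1) has no entry
+     GxB_Vector_subassign (w, NULL, GrB_PLUS_FP64, u, I, 3, NULL) ;
+     GrB_free (&u) ;
+ }
+ \end{verbatim}}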
+\newpage +%------------------------------------------------------------------------------- +\subsubsection{{\sf GxB\_Matrix\_subassign:} assign to a submatrix } +%------------------------------------------------------------------------------- +\label{subassign_matrix} + +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_subassign // C(I,J) = accum (C(I,J),A) +( + GrB_Matrix C, // input/output matrix for results + const GrB_Matrix Mask, // optional mask for C(I,J), unused if NULL + const GrB_BinaryOp accum, // optional accum for Z=accum(C(I,J),T) + const GrB_Matrix A, // first input: matrix A + const GrB_Index *I, // row indices + const GrB_Index ni, // number of row indices + const GrB_Index *J, // column indices + const GrB_Index nj, // number of column indices + const GrB_Descriptor desc // descriptor for C(I,J), Mask, and A +) ; +\end{verbatim} } \end{mdframed} + +\verb'GxB_Matrix_subassign' operates only on a submatrix \verb'S' of \verb'C', +modifying it with the matrix \verb'A'. For this operation, the result is not +the entire matrix \verb'C', but a submatrix \verb'S=C(I,J)' of \verb'C'. The +steps taken are as follows, except that ${\bf A}$ may be optionally transposed +via the \verb'GrB_INP0' descriptor option. + +\vspace{0.1in} +\begin{tabular}{lll} +\hline +Step & GraphBLAS & description \\ + & notation & \\ +\hline +1 & ${\bf S} = {\bf C(I,J)}$ & extract the ${\bf C(I,J)}$ submatrix \\ +2 & ${\bf S \langle M \rangle} = {\bf S} \odot {\bf A}$ & apply the accumulator/mask to the submatrix ${\bf S}$\\ +3 & ${\bf C(I,J)}= {\bf S}$ & put the submatrix ${\bf S}$ back into ${\bf C(I,J)}$ \\ +\hline +\end{tabular} +\vspace{0.1in} + +The accumulator/mask step in Step 2 is the same as for all other GraphBLAS +operations, described in Section~\ref{accummask}, except that for +\verb'GxB_subassign', it is applied to just the submatrix ${\bf S} = {\bf +C(I,J)}$, and thus the \verb'Mask' has the same size as ${\bf A}$, +${\bf S}$, and ${\bf C(I,J)}$. + +The \verb'GxB_subassign' operation is the reverse of matrix extraction: + +\begin{itemize} +\item +For submatrix extraction, \verb'GrB_Matrix_extract', +the submatrix \verb'A(I,J)' appears on the right-hand side of the assignment, +\verb'C=A(I,J)', and entries outside of the submatrix are not accessed and do +not take part in the computation. + +\item +For submatrix assignment, \verb'GxB_Matrix_subassign', +the submatrix \verb'C(I,J)' appears on the left-hand-side of the assignment, +\verb'C(I,J)=A', and entries outside of the submatrix are not accessed and do +not take part in the computation. + +\end{itemize} + +In both methods, the accumulator and mask modify the submatrix of the +assignment; they simply differ on which side of the assignment the submatrix +resides on. In both cases, if the \verb'Mask' matrix is present it is the same +size as the submatrix: + +\begin{itemize} + +\item +For submatrix extraction, +${\bf C \langle M \rangle = C \odot A(I,J)}$ is computed, +where the submatrix is on the right. +The mask ${\bf M}$ has the same size as the submatrix ${\bf A(I,J)}$. + +\item +For submatrix assignment, +${\bf C(I,J) \langle M \rangle = C(I,J) \odot A}$ is computed, +where the submatrix is on the left. +The mask ${\bf M}$ has the same size as the submatrix ${\bf C(I,J)}$. + +\end{itemize} + +In Step 1, the submatrix \verb'S' is first computed by the +\verb'GrB_Matrix_extract' operation, \verb'S=C(I,J)'. 
+
+Step 2 accumulates the results ${\bf S \langle M \rangle = S \odot T}$,
+exactly as described in Section~\ref{accummask}, but operating on the submatrix
+${\bf S}$, not ${\bf C}$, using the optional \verb'Mask' and \verb'accum'
+operator. The matrix ${\bf T}$ is simply ${\bf T}={\bf A}$, or ${\bf T}={\bf
+A}^{\sf T}$ if ${\bf A}$ is transposed via the \verb'desc' descriptor,
+\verb'GrB_INP0'. The \verb'GrB_REPLACE' option in the descriptor clears ${\bf
+S}$ after computing ${\bf Z = T}$ or ${\bf Z = S \odot T}$, not all of ${\bf
+C}$ since this operation can only modify the specified submatrix of ${\bf C}$.
+
+Finally, Step 3 writes the result (which is the modified submatrix \verb'S' and
+not all of \verb'C') back into the \verb'C' matrix that contains it, via the
+assignment \verb'C(I,J)=S', using the reverse operation from the method
+described for matrix extraction:
+
+ {\footnotesize
+ \begin{verbatim}
+ for i = 1:ni
+ for j = 1:nj
+ if (S (i,j).pattern)
+ C (I(i),J(j)).matrix = S (i,j).matrix ;
+ C (I(i),J(j)).pattern = true ;
+ end
+ end
+ end \end{verbatim}}

-\paragraph{\bf Performance considerations:} % u'=u'*A
-If the \verb'GxB_FORMAT' of \verb'A' is \verb'GxB_BY_ROW', and the default
-descriptor is used (\verb'A' is not transposed), then \verb'GrB_vxm' is faster
-than than \verb'GrB_mxv' with its default descriptor, when the vector \verb'u'
-is very sparse.
-However, if the \verb'GxB_FORMAT' of \verb'A' is \verb'GxB_BY_COL', then
-\verb'GrB_mxv' with its default descriptor is faster than \verb'GrB_vxm' with
-its default descriptor, when the vector \verb'u' is very sparse.
-Using the non-default \verb'GrB_TRAN' descriptor for \verb'A' makes the
-\verb'GrB_vxm' operation equivalent to \verb'GrB_mxv' with its default
-descriptor (with the operands reversed in the multiplier, as well). The
-reverse is true as well; \verb'GrB_mxv' with \verb'GrB_TRAN' is the same as
-\verb'GrB_vxm' with a default descriptor.
+\paragraph{\bf Performance considerations:} % C(I,J) = A
+If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
+the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
+fastest if the format of \verb'C' is \verb'GxB_BY_COL'. The opposite is true
+if \verb'A' is transposed.
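+
+The basic call is sketched below (a minimal illustration, not part of the
+specification; the matrix sizes and index lists are hypothetical). With no
+mask and no accumulator, the submatrix is simply overwritten:
+
+ {\footnotesize
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+
+ // sketch: C(I,J) = A, where A must be |I|-by-|J| (here 2-by-3)
+ void matrix_subassign_sketch (GrB_Matrix C, GrB_Matrix A)
+ {
+     GrB_Index I [2] = { 5, 10 } ;      // two row indices of C
+     GrB_Index J [3] = { 0, 2, 4 } ;    // three column indices of C
+     // with accum and Mask both NULL, entries present in C(I,J) but
+     // not in A are deleted
+     GxB_Matrix_subassign (C, NULL, NULL, A, I, 2, J, 3, NULL) ;
+ }
+ \end{verbatim}}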
\newpage -%=============================================================================== -\subsection{{\sf GrB\_mxv:} matrix-vector multiply} %=========================== -%=============================================================================== -\label{mxv} +%------------------------------------------------------------------------------- +\subsubsection{{\sf GxB\_Col\_subassign:} assign to a sub-column of a matrix} +%------------------------------------------------------------------------------- +\label{subassign_column} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GrB_mxv // w = accum (w, A*u) +GrB_Info GxB_subassign // C(I,j) = accum (C(I,j),u) ( - GrB_Vector w, // input/output vector for results - const GrB_Vector mask, // optional mask for w, unused if NULL - const GrB_BinaryOp accum, // optional accum for z=accum(w,t) - const GrB_Semiring semiring, // defines '+' and '*' for A*B - const GrB_Matrix A, // first input: matrix A - const GrB_Vector u, // second input: vector u - const GrB_Descriptor desc // descriptor for w, mask, and A + GrB_Matrix C, // input/output matrix for results + const GrB_Vector mask, // optional mask for C(I,j), unused if NULL + const GrB_BinaryOp accum, // optional accum for z=accum(C(I,j),t) + const GrB_Vector u, // input vector + const GrB_Index *I, // row indices + const GrB_Index ni, // number of row indices + const GrB_Index j, // column index + const GrB_Descriptor desc // descriptor for C(I,j) and mask ) ; \end{verbatim} } \end{mdframed} -\verb'GrB_mxv' multiplies a matrix \verb'A' times a column vector \verb'u'. -The matrix \verb'A' may be first transposed according to \verb'desc' (as the -first input); the column vector \verb'u' is never transposed via the -descriptor. The inputs \verb'A' and \verb'u' are typecasted to match the -\verb'xtype' and \verb'ytype' inputs, respectively, of the multiply operator of -the \verb'semiring'. Next, an intermediate column vector \verb't=A*u' is -computed on the \verb'semiring' using the same method as \verb'GrB_mxm'. -Finally, the column vector \verb't' is typecasted from the \verb'ztype' of the -multiply operator of the \verb'semiring' into the type of \verb'w', and the -results are written back into \verb'w' using the optional accumulator -\verb'accum' and \verb'mask'. - -The last step is ${\bf w \langle m \rangle = w \odot t}$, as described -in Section~\ref{accummask}, except that all the terms are column vectors instead -of matrices. - -\paragraph{\bf Performance considerations:} % u=A*u -Refer to the discussion of \verb'GrB_vxm'. In SuiteSparse:GraphBLAS, -\verb'GrB_mxv' is very efficient when \verb'u' is sparse or dense, when the -default descriptor is used, and when the matrix is \verb'GxB_BY_COL'. When -\verb'u' is very sparse and \verb'GrB_INP0' is set to its non-default -\verb'GrB_TRAN', then this method is not efficient if the matrix is in -\verb'GxB_BY_COL' format. If an application needs to perform \verb"A'*u" -repeatedly where \verb'u' is very sparse, then use the \verb'GxB_BY_ROW' format -for \verb'A' instead. - -\newpage -%=============================================================================== -\subsection{{\sf GrB\_eWiseMult:} element-wise operations, set intersection} %== -%=============================================================================== -\label{eWiseMult} +\verb'GxB_Col_subassign' modifies a single sub-column of a matrix \verb'C'. 
It +is the same as \verb'GxB_Matrix_subassign' where the index vector \verb'J[0]=j' +is a single column index (and thus \verb'nj=1'), and where all matrices in +\verb'GxB_Matrix_subassign' (except \verb'C') consist of a single column. The +\verb'mask' vector has the same size as \verb'u' and the sub-column +\verb'C(I,j)'. The input descriptor \verb'GrB_INP0' is ignored; the input +vector \verb'u' is not transposed. Refer to \verb'GxB_Matrix_subassign' for +further details. -Element-wise ``multiplication'' is shorthand for applying a binary operator -element-wise on two matrices or vectors \verb'A' and \verb'B', for all entries -that appear in the set intersection of the patterns of \verb'A' and \verb'B'. -This is like \verb'A.*B' for two sparse matrices in MATLAB, except that in -GraphBLAS any binary operator can be used, not just multiplication. +\paragraph{\bf Performance considerations:} % C(I,j) = u +\verb'GxB_Col_subassign' is much faster than \verb'GxB_Row_subassign' if the +format of \verb'C' is \verb'GxB_BY_COL'. \verb'GxB_Row_subassign' is much +faster than \verb'GxB_Col_subassign' if the format of \verb'C' is +\verb'GxB_BY_ROW'. -The pattern of the result of the element-wise ``multiplication'' is exactly -this set intersection. Entries in \verb'A' but not \verb'B', or visa versa, do -not appear in the result. +% \newpage +%------------------------------------------------------------------------------- +\subsubsection{{\sf GxB\_Row\_subassign:} assign to a sub-row of a matrix} +%------------------------------------------------------------------------------- +\label{subassign_row} -Let $\otimes$ denote the binary operator to be used. The computation ${\bf T = -A \otimes B}$ is given below. Entries not in the intersection of ${\bf A}$ and -${\bf B}$ do not appear in the pattern of ${\bf T}$. That is: - \vspace{-0.2in} - {\small - \begin{tabbing} - \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\ - \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\ - \> \> $t_{ij} = a_{ij} \otimes b_{ij}$ \\ - \end{tabbing} } - \vspace{-0.2in} +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GxB_subassign // C(i,J) = accum (C(i,J),u') +( + GrB_Matrix C, // input/output matrix for results + const GrB_Vector mask, // optional mask for C(i,J), unused if NULL + const GrB_BinaryOp accum, // optional accum for z=accum(C(i,J),t) + const GrB_Vector u, // input vector + const GrB_Index i, // row index + const GrB_Index *J, // column indices + const GrB_Index nj, // number of column indices + const GrB_Descriptor desc // descriptor for C(i,J) and mask +) ; +\end{verbatim} } \end{mdframed} -Depending on what kind of operator is used and what the implicit value is -assumed to be, this can give the Hadamard product. This is the case for -\verb'A.*B' in MATLAB since the implicit value is zero. However, computing a -Hadamard product is not necessarily the goal of the \verb'eWiseMult' operation. -It simply applies any binary operator, built-in or user-defined, to the set -intersection of \verb'A' and \verb'B', and discards any entry outside this -intersection. Its usefulness in a user's application does not depend upon it -computing a Hadamard product in all cases. The operator need not be -associative, commutative, nor have any particular property except for type -compatibility with \verb'A' and \verb'B', and the output matrix \verb'C'. +\verb'GxB_Row_subassign' modifies a single sub-row of a matrix \verb'C'. 
It is
+the same as \verb'GxB_Matrix_subassign' where the index vector \verb'I[0]=i' is
+a single row index (and thus \verb'ni=1'), and where all matrices in
+\verb'GxB_Matrix_subassign' (except \verb'C') consist of a single row. The
+\verb'mask' vector has the same size as \verb'u' and the sub-row
+\verb'C(i,J)'. The input descriptor \verb'GrB_INP0' is ignored; the input
+vector \verb'u' is not transposed. Refer to \verb'GxB_Matrix_subassign' for
+further details.

-The generic name for this operation is \verb'GrB_eWiseMult', which can be used
-for both matrices and vectors.
+\paragraph{\bf Performance considerations:} % C(i,J) = u'
+\verb'GxB_Col_subassign' is much faster than \verb'GxB_Row_subassign' if the
+format of \verb'C' is \verb'GxB_BY_COL'. \verb'GxB_Row_subassign' is much
+faster than \verb'GxB_Col_subassign' if the format of \verb'C' is
+\verb'GxB_BY_ROW'.

-\newpage
+% \newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_eWiseMult:} element-wise vector multiply}
+\subsubsection{{\sf GxB\_Vector\_subassign\_$<$type$>$:} assign a scalar to a subvector}
%-------------------------------------------------------------------------------
-\label{eWiseMult_vector}
+\label{subassign_vector_scalar}

 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GrB_eWiseMult // w = accum (w, u.*v)
+GrB_Info GxB_subassign // w(I) = accum (w(I),x)
 (
 GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const <operator> multiply, // defines '.*' for t=u.*v
- const GrB_Vector u, // first input: vector u
- const GrB_Vector v, // second input: vector v
- const GrB_Descriptor desc // descriptor for w and mask
+ const GrB_Vector mask, // optional mask for w(I), unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(w(I),x)
+ const <type> x, // scalar to assign to w(I)
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
+ const GrB_Descriptor desc // descriptor for w(I) and mask
 ) ;
-\end{verbatim}
-} \end{mdframed}
-
-\verb'GrB_Vector_eWiseMult' computes the element-wise ``multiplication'' of two
-vectors \verb'u' and \verb'v', element-wise using any binary operator (not just
-times). The vectors are not transposed via the descriptor. The vectors
-\verb'u' and \verb'v' are first typecasted into the first and second inputs of
-the \verb'multiply' operator. Next, a column vector \verb't' is computed,
-denoted ${\bf t = u \otimes v}$. The pattern of \verb't' is the set
-intersection of \verb'u' and \verb'v'. The result \verb't' has the type of the
-output \verb'ztype' of the \verb'multiply' operator.
-
-The \verb'operator' is typically a \verb'GrB_BinaryOp', but the method is
-type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
-additive operator of the monoid is used as the \verb'multiply' binary operator.
-If given a semiring (\verb'GrB_Semiring'), the multiply operator of the
-semiring is used as the \verb'multiply' binary operator.
+\end{verbatim} } \end{mdframed}

-The next and final step is ${\bf w \langle m \rangle = w \odot t}$, as
-described in Section~\ref{accummask}, except that all the terms are column
-vectors instead of matrices.
Note for all GraphBLAS operations, including this
-one, the accumulator ${\bf w \odot t}$ is always applied in a set union manner,
-even though ${\bf t = u \otimes v}$ for this operation is applied in a set
-intersection manner.
+\verb'GxB_Vector_subassign_<type>' assigns a single scalar to an entire
+subvector of the vector \verb'w'. The operation is exactly like setting a
+single entry in an \verb'n'-by-1 matrix, \verb'A(I,0) = x', where the column
+index for a vector is implicitly \verb'j=0'. For further details of this
+function, see \verb'GxB_Matrix_subassign_<type>' in
+Section~\ref{subassign_matrix_scalar}.

 \newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_eWiseMult:} element-wise matrix multiply}
+\subsubsection{{\sf GxB\_Matrix\_subassign\_$<$type$>$:} assign a scalar to a submatrix}
%-------------------------------------------------------------------------------
-\label{eWiseMult_matrix}
+\label{subassign_matrix_scalar}

 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GrB_eWiseMult // C = accum (C, A.*B)
+GrB_Info GxB_subassign // C(I,J) = accum (C(I,J),x)
 (
 GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const <operator> multiply, // defines '.*' for T=A.*B
- const GrB_Matrix A, // first input: matrix A
- const GrB_Matrix B, // second input: matrix B
- const GrB_Descriptor desc // descriptor for C, Mask, A, and B
+ const GrB_Matrix Mask, // optional mask for C(I,J), unused if NULL
+ const GrB_BinaryOp accum, // optional accum for Z=accum(C(I,J),x)
+ const <type> x, // scalar to assign to C(I,J)
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
+ const GrB_Index *J, // column indices
+ const GrB_Index nj, // number of column indices
+ const GrB_Descriptor desc // descriptor for C(I,J) and Mask
 ) ;
-\end{verbatim}
-} \end{mdframed}
-
-\verb'GrB_Matrix_eWiseMult' computes the element-wise ``multiplication'' of two
-matrices \verb'A' and \verb'B', element-wise using any binary operator (not
-just times). The input matrices may be transposed first, according to the
-descriptor \verb'desc'. They are then typecasted into the first and second
-inputs of the \verb'multiply' operator. Next, a matrix \verb'T' is computed,
-denoted ${\bf T = A \otimes B}$. The pattern of \verb'T' is the set
-intersection of \verb'A' and \verb'B'. The result \verb'T' has the type of the
-output \verb'ztype' of the \verb'multiply' operator.
+\end{verbatim} } \end{mdframed}

-The \verb'multiply' operator is typically a \verb'GrB_BinaryOp', but the method
-is type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
-additive operator of the monoid is used as the \verb'multiply' binary operator.
-If given a semiring (\verb'GrB_Semiring'), the multiply operator of the
-semiring is used as the \verb'multiply' binary operator.
+\verb'GxB_Matrix_subassign_<type>' assigns a single scalar to an entire
+submatrix of \verb'C', like the {\em scalar expansion} \verb'C(I,J)=x' in
+MATLAB. The scalar \verb'x' is implicitly expanded into a matrix \verb'A' of
+size \verb'ni' by \verb'nj', with all entries present and equal to \verb'x',
+and then the matrix \verb'A' is assigned to
+\verb'C(I,J)' using the same method as in \verb'GxB_Matrix_subassign'. Refer
+to that function in Section~\ref{subassign_matrix} for further details.
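+
+For example, the following sketch (with hypothetical indices; \verb'C' is
+assumed to have type \verb'GrB_FP64') assigns the scalar 42 to every entry of
+a 2-by-2 submatrix:
+
+ {\footnotesize
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+
+ // sketch: C(I,J) = 42, a scalar expansion into four entries of C
+ void subassign_scalar_sketch (GrB_Matrix C)
+ {
+     GrB_Index I [2] = { 0, 1 } ;
+     GrB_Index J [2] = { 4, 7 } ;
+     // C(0,4), C(0,7), C(1,4), and C(1,7) all become 42
+     GxB_Matrix_subassign_FP64 (C, NULL, NULL, 42, I, 2, J, 2, NULL) ;
+ }
+ \end{verbatim}}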
+For the accumulation step, the scalar \verb'x' is typecasted directly into the
+type of \verb'C' when the \verb'accum' operator is not applied to it, or into
+the \verb'ytype' of the \verb'accum' operator, if \verb'accum' is not NULL, for
+entries that are already present in \verb'C'.

-\vspace{0.05in}
-The operation can be expressed in MATLAB notation as:
- {\footnotesize
- \begin{verbatim}
- [nrows, ncols] = size (A.matrix) ;
- T.matrix = zeros (nrows, ncols, multiply.ztype) ;
- T.class = multiply.ztype ;
- p = A.pattern & B.pattern ;
- A = cast (A.matrix (p), multiply.xtype) ;
- B = cast (B.matrix (p), multiply.ytype) ;
- T.matrix (p) = multiply (A, B) ;
- T.pattern = p ; \end{verbatim} }
+The \verb'<type> x' notation is otherwise the same as
+\verb'GrB_Matrix_setElement' (see Section~\ref{matrix_setElement}). Any value
+can be passed to this function and its type will be detected, via the
+\verb'_Generic' feature of ANSI C11. For a user-defined type, \verb'x' is a
+\verb'void *' pointer that points to a memory space holding a single entry of a
+scalar that has exactly the same user-defined type as the matrix \verb'C'.
+This user-defined type must exactly match the user-defined type of \verb'C'
+since no typecasting is done between user-defined types.

-The final step is ${\bf C \langle M \rangle = C \odot T}$, as described in
-Section~\ref{accummask}. Note for all GraphBLAS operations, including this
-one, the accumulator ${\bf C \odot T}$ is always applied in a set union manner,
-even though ${\bf T = A \otimes B}$ for this operation is applied in a set
-intersection manner.
+If a \verb'void *' pointer is passed in and the type of the underlying scalar
+does not exactly match the user-defined type of \verb'C', then results are
+undefined. No error status will be returned since GraphBLAS has no way of
+catching this error.
+If \verb'x' is a \verb'GrB_Scalar' with no entry, then it is implicitly
+expanded into a matrix \verb'A' of size \verb'ni' by \verb'nj', with no entries
+present.
+
+\paragraph{\bf Performance considerations:} % C(I,J) = scalar
+If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
+the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
+fastest if the format of \verb'C' is \verb'GxB_BY_COL'. The opposite is true
+if \verb'A' is transposed.

 \newpage
%===============================================================================
-\subsection{{\sf GrB\_eWiseAdd:} element-wise operations, set union} %==========
+\subsection{{\sf GrB\_assign:} submatrix assignment} %==========================
%===============================================================================
-\label{eWiseAdd}
-
-Element-wise ``addition'' is shorthand for applying a binary operator
-element-wise on two matrices or vectors \verb'A' and \verb'B', for all entries
-that appear in the set intersection of the patterns of \verb'A' and \verb'B'.
-This is like \verb'A+B' for two sparse matrices in MATLAB, except that in
-GraphBLAS any binary operator can be used, not just addition. The pattern of
-the result of the element-wise ``addition'' is the set union of the pattern of
-\verb'A' and \verb'B'. Entries in neither in \verb'A' nor in \verb'B' do
-not appear in the result.
-
-Let $\oplus$ denote the binary operator to be used. The computation ${\bf T =
-A \oplus B}$ is exactly the same as the computation with accumulator operator
-as described in Section~\ref{accummask}. It acts like a sparse matrix
-addition, except that any operator can be used.
The pattern of ${\bf A \oplus -B}$ is the set union of the patterns of ${\bf A}$ and ${\bf B}$, and the -operator is applied only on the set intersection of ${\bf A}$ and ${\bf B}$. -Entries not in either the pattern of ${\bf A}$ or ${\bf B}$ do not appear in -the pattern of ${\bf T}$. That is: - \vspace{-0.2in} - {\small - \begin{tabbing} - \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\ - \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\ - \> \> $t_{ij} = a_{ij} \oplus b_{ij}$ \\ - \> for all entries $(i,j)$ in ${\bf A \setminus B}$ \\ - \> \> $t_{ij} = a_{ij}$ \\ - \> for all entries $(i,j)$ in ${\bf B \setminus A}$ \\ - \> \> $t_{ij} = b_{ij}$ - \end{tabbing} - } - -The only difference between element-wise ``multiplication'' (${\bf T =A \otimes -B}$) and ``addition'' (${\bf T = A \oplus B}$) is the pattern of the result, -and what happens to entries outside the intersection. With $\otimes$ the -pattern of ${\bf T}$ is the intersection; with $\oplus$ it is the set union. -Entries outside the set intersection are dropped for $\otimes$, and kept for -$\oplus$; in both cases the operator is only applied to those (and only those) -entries in the intersection. Any binary operator can be used interchangeably -for either operation. - -Element-wise operations do not operate on the implicit values, even implicitly, -since the operations make no assumption about the semiring. As a result, the -results can be different from MATLAB, which can always assume the implicit -value is zero. For example, \verb'C=A-B' is the conventional matrix -subtraction in MATLAB. Computing \verb'A-B' in GraphBLAS with \verb'eWiseAdd' -will apply the \verb'MINUS' operator to the intersection, entries in \verb'A' -but not \verb'B' will be unchanged and appear in \verb'C', and entries in -neither \verb'A' nor \verb'B' do not appear in \verb'C'. For these cases, the -results matches the MATLAB \verb'C=A-B'. Entries in \verb'B' but not \verb'A' -do appear in \verb'C' but they are not negated; they cannot be subtracted from -an implicit value in \verb'A'. This is by design. If conventional matrix -subtraction of two sparse matrices is required, and the implicit value is known -to be zero, use \verb'GrB_apply' to negate the values in \verb'B', and then -use \verb'eWiseAdd' with the \verb'PLUS' operator, to compute \verb'A+(-B)'. +\label{assign} -The generic name for this operation is \verb'GrB_eWiseAdd', which can be used -for both matrices and vectors. +The methods described in this section are all variations of the form +\verb'C(I,J)=A', which modifies a submatrix of the matrix \verb'C'. All +methods can be used in their generic form with the single name +\verb'GrB_assign'. These methods are very similar to their +\verb'GxB_subassign' counterparts in Section~\ref{subassign}. They differ +primarily in the size of the \verb'Mask', and how the \verb'GrB_REPLACE' option +works. Section~\ref{compare_assign} compares +\verb'GxB_subassign' and \verb'GrB_assign'. -There is another minor difference in two variants of the element-wise -functions. If given a \verb'semiring', the \verb'eWiseAdd' functions use the -binary operator of the semiring's monoid, while the \verb'eWiseMult' functions -use the multiplicative operator of the semiring. +See Section~\ref{colon} for a description of +\verb'I', \verb'ni', \verb'J', and \verb'nj'. 
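+
+The difference in \verb'Mask' dimensions can be sketched as follows (a
+hypothetical fragment, not part of the specification; the mask contents are
+elided, and only the required dimensions are shown):
+
+ {\footnotesize
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+
+ // sketch: C is 100-by-100 and A is 2-by-3, so C(I,J) is 2-by-3
+ void mask_size_sketch (GrB_Matrix C, GrB_Matrix A,
+     const GrB_Index *I, const GrB_Index *J)   // |I| = 2, |J| = 3
+ {
+     GrB_Matrix Msub, Mall ;
+     GrB_Matrix_new (&Msub, GrB_BOOL, 2, 3) ;     // size of C(I,J)
+     GrB_Matrix_new (&Mall, GrB_BOOL, 100, 100) ; // size of all of C
+     // (masks shown empty for brevity; a real mask would have entries)
+     // GxB_subassign: the Mask has the same size as C(I,J)
+     GxB_Matrix_subassign (C, Msub, NULL, A, I, 2, J, 3, NULL) ;
+     // GrB_assign: the Mask has the same size as C
+     GrB_Matrix_assign (C, Mall, NULL, A, I, 2, J, 3, NULL) ;
+     GrB_free (&Msub) ; GrB_free (&Mall) ;
+ }
+ \end{verbatim}}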
-
% \newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_eWiseAdd:} element-wise vector addition}
+\subsubsection{{\sf GrB\_Vector\_assign:} assign to a subvector }
%-------------------------------------------------------------------------------
-\label{eWiseAdd_vector}
+\label{assign_vector}

 \begin{mdframed}[userdefinedwidth=6in]
 {\footnotesize
 \begin{verbatim}
-GrB_Info GrB_eWiseAdd // w = accum (w, u+v)
+GrB_Info GrB_assign // w(I) = accum (w(I),u)
 (
 GrB_Vector w, // input/output vector for results
 const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const <operator> add, // defines '+' for t=u+v
+ const GrB_BinaryOp accum, // optional accum for z=accum(w(I),t)
 const GrB_Vector u, // first input: vector u
- const GrB_Vector v, // second input: vector v
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
 const GrB_Descriptor desc // descriptor for w and mask
 ) ;
\end{verbatim} } \end{mdframed}

-\verb'GrB_Vector_eWiseAdd' computes the element-wise ``addition'' of two
-vectors \verb'u' and \verb'v', element-wise using any binary operator (not just
-plus). The vectors are not transposed via the descriptor. Entries in the
-intersection of \verb'u' and \verb'v' are first typecasted into the first and
-second inputs of the \verb'add' operator. Next, a column vector \verb't' is
-computed, denoted ${\bf t = u \oplus v}$. The pattern of \verb't' is the set
-union of \verb'u' and \verb'v'. The result \verb't' has the type of the output
-\verb'ztype' of the \verb'add' operator.
-
-The \verb'add' operator is typically a \verb'GrB_BinaryOp', but the method is
-type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the
-additive operator of the monoid is used as the \verb'add' binary operator. If
-given a semiring (\verb'GrB_Semiring'), the additive operator of the monoid of
-the semiring is used as the \verb'add' binary operator.
-
-The final step is ${\bf w \langle m \rangle = w \odot t}$, as described in
-Section~\ref{accummask}, except that all the terms are column vectors instead
-of matrices.
+\verb'GrB_Vector_assign' operates on a subvector \verb'w(I)' of \verb'w',
+modifying it with the vector \verb'u'. The \verb'mask' vector has the same
+size as \verb'w'. The method is identical to \verb'GrB_Matrix_assign'
+described in Section~\ref{assign_matrix}, where all matrices have a single
+column each. The only other difference is that the input \verb'u' in this
+method is not transposed via the \verb'GrB_INP0' descriptor.
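+
+A common idiom, sketched below with hypothetical vectors \verb'w' and \verb'u'
+of the same length \verb'n', uses \verb'GrB_ALL' to assign to all of \verb'w':
+
+ {\footnotesize
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+
+ // sketch: w = u, using GrB_ALL to denote the full index list 0:n-1
+ void vector_assign_sketch (GrB_Vector w, GrB_Vector u, GrB_Index n)
+ {
+     GrB_Vector_assign (w, NULL, NULL, u, GrB_ALL, n, NULL) ;
+ }
+ \end{verbatim}}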
-% \newpage +\newpage %------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Matrix\_eWiseAdd:} element-wise matrix addition} +\subsubsection{{\sf GrB\_Matrix\_assign:} assign to a submatrix } %------------------------------------------------------------------------------- -\label{eWiseAdd_matrix} +\label{assign_matrix} \begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GrB_eWiseAdd // C = accum (C, A+B) +GrB_Info GrB_assign // C(I,J) = accum (C(I,J),A) ( GrB_Matrix C, // input/output matrix for results const GrB_Matrix Mask, // optional mask for C, unused if NULL - const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) - const add, // defines '+' for T=A+B + const GrB_BinaryOp accum, // optional accum for Z=accum(C(I,J),T) const GrB_Matrix A, // first input: matrix A - const GrB_Matrix B, // second input: matrix B - const GrB_Descriptor desc // descriptor for C, Mask, A, and B + const GrB_Index *I, // row indices + const GrB_Index ni, // number of row indices + const GrB_Index *J, // column indices + const GrB_Index nj, // number of column indices + const GrB_Descriptor desc // descriptor for C, Mask, and A ) ; \end{verbatim} } \end{mdframed} -\verb'GrB_Matrix_eWiseAdd' computes the element-wise ``addition'' of two -matrices \verb'A' and \verb'B', element-wise using any binary operator (not -just plus). The input matrices may be transposed first, according to the -descriptor \verb'desc'. Entries in the intersection then typecasted into the -first and second inputs of the \verb'add' operator. Next, a matrix \verb'T' is -computed, denoted ${\bf T = A \oplus B}$. The pattern of \verb'T' is the set -union of \verb'A' and \verb'B'. The result \verb'T' has the type of the output -\verb'ztype' of the \verb'add' operator. +\verb'GrB_Matrix_assign' operates on a submatrix \verb'S' of \verb'C', +modifying it with the matrix \verb'A'. It may also modify all of \verb'C', +depending on the input descriptor \verb'desc' and the \verb'Mask'. -The \verb'add' operator is typically a \verb'GrB_BinaryOp', but the method is -type-generic for this parameter. If given a monoid (\verb'GrB_Monoid'), the -additive operator of the monoid is used as the \verb'add' binary operator. If -given a semiring (\verb'GrB_Semiring'), the additive operator of the monoid of -the semiring is used as the \verb'add' binary operator. 
+
+\vspace{0.1in}
+\begin{tabular}{lll}
+\hline
+Step & GraphBLAS & description \\
+ & notation & \\
+\hline
+1 & ${\bf S} = {\bf C(I,J)}$ & extract ${\bf C(I,J)}$ submatrix \\
+2 & ${\bf S} = {\bf S} \odot {\bf A}$ & apply the accumulator (but not the mask) to ${\bf S}$\\
+3 & ${\bf Z} = {\bf C}$ & make a copy of ${\bf C}$ \\
+4 & ${\bf Z(I,J)} = {\bf S}$ & put the submatrix into ${\bf Z(I,J)}$ \\
+5 & ${\bf C \langle M \rangle = Z}$ & apply the mask/replace phase to all of ${\bf C}$ \\
+\hline
+\end{tabular}
+\vspace{0.1in}
+
-\vspace{0.05in}
-The operation can be expressed in MATLAB notation as:
- {\footnotesize
- \begin{verbatim}
- [nrows, ncols] = size (A.matrix) ;
- T.matrix = zeros (nrows, ncols, add.ztype) ;
- p = A.pattern & B.pattern ;
- A = GB_mex_cast (A.matrix (p), add.xtype) ;
- B = GB_mex_cast (B.matrix (p), add.ytype) ;
- T.matrix (p) = add (A, B) ;
- p = A.pattern & ~B.pattern ; T.matrix (p) = cast (A.matrix (p), add.ztype) ;
- p = ~A.pattern & B.pattern ; T.matrix (p) = cast (B.matrix (p), add.ztype) ;
- T.pattern = A.pattern | B.pattern ;
- T.class = add.ztype ; \end{verbatim} }
-Except for when typecasting is performed, this is identical to how the
-\verb'accum' operator is applied in Figure~\ref{fig_accummask}.
+In contrast to \verb'GxB_subassign', the \verb'Mask' has the same size as \verb'C'.
+
+Step 1 extracts the submatrix and then Step 2 applies the accumulator
+(or ${\bf S}={\bf A}$ if \verb'accum' is \verb'NULL'). The \verb'Mask' is
+not yet applied.
+
+Step 3 makes a copy of the ${\bf C}$ matrix, and then Step 4 writes the
+submatrix ${\bf S}$ into ${\bf Z}$. This is the same as Step 3 of
+\verb'GxB_subassign', except that it operates on a temporary matrix ${\bf Z}$.
+
+Finally, Step 5 writes ${\bf Z}$ back into ${\bf C}$ via the \verb'Mask', using
+the Mask/Replace Phase described in Section~\ref{accummask}. If
+\verb'GrB_REPLACE' is enabled, then all of ${\bf C}$ is cleared prior to
+writing ${\bf Z}$ via the mask. As a result, the \verb'GrB_REPLACE' option can
+delete entries outside the ${\bf C(I,J)}$ submatrix.
+
+\paragraph{\bf Performance considerations:} % C(I,J) = A
+If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
+the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
+fastest if the format of \verb'C' is \verb'GxB_BY_COL'. The opposite is true
+if \verb'A' is transposed.
+
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Col\_assign:} assign to a sub-column of a matrix}
+%-------------------------------------------------------------------------------
+\label{assign_column}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_assign // C(I,j) = accum (C(I,j),u)
+(
+ GrB_Matrix C, // input/output matrix for results
+ const GrB_Vector mask, // optional mask for C(:,j), unused if NULL
+ const GrB_BinaryOp accum, // optional accum for z=accum(C(I,j),t)
+ const GrB_Vector u, // input vector
+ const GrB_Index *I, // row indices
+ const GrB_Index ni, // number of row indices
+ const GrB_Index j, // column index
+ const GrB_Descriptor desc // descriptor for C(:,j) and mask
+) ;
+\end{verbatim} } \end{mdframed}

+\verb'GrB_Col_assign' modifies a single sub-column of a matrix \verb'C'.
It is +the same as \verb'GrB_Matrix_assign' where the index vector \verb'J[0]=j' is a +single column index, and where all matrices in \verb'GrB_Matrix_assign' (except +\verb'C') consist of a single column. + +Unlike \verb'GrB_Matrix_assign', the \verb'mask' is a vector with the same size +as a single column of \verb'C'. + +The input descriptor \verb'GrB_INP0' is ignored; the input vector \verb'u' is +not transposed. Refer to \verb'GrB_Matrix_assign' for further details. + +\paragraph{\bf Performance considerations:} % C(I,j) = u +\verb'GrB_Col_assign' is much faster than \verb'GrB_Row_assign' if the format +of \verb'C' is \verb'GxB_BY_COL'. \verb'GrB_Row_assign' is much faster than +\verb'GrB_Col_assign' if the format of \verb'C' is \verb'GxB_BY_ROW'. \newpage -%=============================================================================== -\subsection{{\sf GxB\_eWiseUnion:} element-wise operations, set union} %======== -%=============================================================================== -\label{eWiseUnion} +%------------------------------------------------------------------------------- +\subsubsection{{\sf GrB\_Row\_assign:} assign to a sub-row of a matrix} +%------------------------------------------------------------------------------- +\label{assign_row} -\verb'GxB_eWiseUnion' computes a result with the same pattern -\verb'GrB_eWiseAdd', namely, a set union of its two inputs. It differs in how -the binary operator is applied. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GrB_assign // C(i,J) = accum (C(i,J),u') +( + GrB_Matrix C, // input/output matrix for results + const GrB_Vector mask, // optional mask for C(i,:), unused if NULL + const GrB_BinaryOp accum, // optional accum for z=accum(C(i,J),t) + const GrB_Vector u, // input vector + const GrB_Index i, // row index + const GrB_Index *J, // column indices + const GrB_Index nj, // number of column indices + const GrB_Descriptor desc // descriptor for C(i,:) and mask +) ; +\end{verbatim} } \end{mdframed} -Let $\oplus$ denote the binary operator to be used. The operator is applied to -every entry in $\bf A$ and $\bf B$. A pair of scalars, $\alpha$ and $\beta$ -(\verb'alpha' and \verb'beta' in the API, respectively) define the -inputs to the operator when entries are present in one matrix but not the -other. +\verb'GrB_Row_assign' modifies a single sub-row of a matrix \verb'C'. It is +the same as \verb'GrB_Matrix_assign' where the index vector \verb'I[0]=i' is +a single row index, and where all matrices in \verb'GrB_Matrix_assign' +(except \verb'C') consist of a single row. - \vspace{-0.2in} - {\small - \begin{tabbing} - \hspace{2em} \= \hspace{2em} \= \hspace{2em} \= \\ - \> for all entries $(i,j)$ in ${\bf A \cap B}$ \\ - \> \> $t_{ij} = a_{ij} \oplus b_{ij}$ \\ - \> for all entries $(i,j)$ in ${\bf A \setminus B}$ \\ - \> \> $t_{ij} = a_{ij} \oplus \beta $ \\ - \> for all entries $(i,j)$ in ${\bf B \setminus A}$ \\ - \> \> $t_{ij} = \alpha \oplus b_{ij}$ - \end{tabbing} - } +Unlike \verb'GrB_Matrix_assign', the \verb'mask' is a vector with the same size +as a single row of \verb'C'. -\verb'GxB_eWiseUnion' is useful in contexts where \verb'GrB_eWiseAdd' cannot be -used because of the typecasting rules of GraphBLAS. In particular, suppose -\verb'A' and \verb'B' are matrices with a user-defined type, and suppose -\verb'<' is a user-defined operator that compares two entries of this type and -returns a Boolean value. 
Then \verb'C=A<B' can be computed with \verb'GxB_eWiseUnion', but not
-with \verb'GrB_eWiseAdd'.

-\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Vector\_eWiseUnion:} element-wise vector addition}
+\subsubsection{{\sf GrB\_Vector\_assign\_$<$type$>$:} assign a scalar to a subvector}
%-------------------------------------------------------------------------------
-\label{eWiseUnion_vector}
+\label{assign_vector_scalar}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_eWiseUnion             // w = accum (w, u+v)
+GrB_Info GrB_assign                 // w(I) = accum (w(I),x)
(
    GrB_Vector w,                   // input/output vector for results
    const GrB_Vector mask,          // optional mask for w, unused if NULL
-   const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
-   const GrB_BinaryOp add,         // defines '+' for t=u+v
-   const GrB_Vector u,             // first input: vector u
-   const GrB_Scalar alpha,
-   const GrB_Vector v,             // second input: vector v
-   const GrB_Scalar beta,
+   const GrB_BinaryOp accum,       // optional accum for z=accum(w(I),x)
+   const <type> x,                 // scalar to assign to w(I)
+   const GrB_Index *I,             // row indices
+   const GrB_Index ni,             // number of row indices
    const GrB_Descriptor desc       // descriptor for w and mask
) ;
\end{verbatim} } \end{mdframed}

-Identical to \verb'GrB_Vector_eWiseAdd' except that two scalars are used
-to define how to compute the result when entries are present in one of
-the two input vectors (\verb'u' and \verb'v'), but not the other.
-Each of the two input scalars, \verb'alpha' and \verb'beta'
-must contain an entry.
-When computing the result \verb't=u+v',
-if \verb'u(i)' is present but \verb'v(i)' is not, then \verb't(i)=u(i)+beta'.
-Likewise,
-if \verb'v(i)' is present but \verb'u(i)' is not, then \verb't(i)=alpha+v(i)',
-where \verb'+' denotes the binary operator, \verb'add'.
+\verb'GrB_Vector_assign_<type>' assigns a single scalar to an entire subvector
+of the vector \verb'w'.  The operation is exactly like setting a single entry
+in an \verb'n'-by-1 matrix, \verb'A(I,0) = x', where the column index for a
+vector is implicitly \verb'j=0'.  The \verb'mask' vector has the same size as
+\verb'w'.  For further details of this function, see
+\verb'GrB_Matrix_assign_<type>' in the next section
+(\ref{assign_matrix_scalar}).

-\newpage
+Following the C API Specification, results are well-defined if \verb'I'
+contains duplicate indices.  Duplicate indices are simply ignored.  See
+Section~\ref{duplicates} for more details.
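+
+As a minimal sketch (not part of the specification; it assumes a previously
+created \verb'GrB_Vector w' of type \verb'GrB_INT32' and length at least 5),
+the following fragment assigns the scalar 42 to three entries of \verb'w':
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Index I [3] = {0, 2, 4} ;
+    // w (I) = 42, with no mask and no accumulator:
+    GrB_Vector_assign_INT32 (w, NULL, NULL, 42, I, 3, NULL) ;
+\end{verbatim}}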
+
+% \newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_eWiseUnion:} element-wise matrix addition}
+\subsubsection{{\sf GrB\_Matrix\_assign\_$<$type$>$:} assign a scalar to a submatrix}
%-------------------------------------------------------------------------------
-\label{eWiseUnion_matrix}
+\label{assign_matrix_scalar}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GxB_eWiseUnion             // C = accum (C, A+B)
+GrB_Info GrB_assign                 // C(I,J) = accum (C(I,J),x)
(
    GrB_Matrix C,                   // input/output matrix for results
    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
-   const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
-   const GrB_BinaryOp add,         // defines '+' for T=A+B
-   const GrB_Matrix A,             // first input: matrix A
-   const GrB_Scalar alpha,
-   const GrB_Matrix B,             // second input: matrix B
-   const GrB_Scalar beta,
-   const GrB_Descriptor desc       // descriptor for C, M, A, and B
+   const GrB_BinaryOp accum,       // optional accum for Z=accum(C(I,J),x)
+   const <type> x,                 // scalar to assign to C(I,J)
+   const GrB_Index *I,             // row indices
+   const GrB_Index ni,             // number of row indices
+   const GrB_Index *J,             // column indices
+   const GrB_Index nj,             // number of column indices
+   const GrB_Descriptor desc       // descriptor for C and Mask
) ;
\end{verbatim} } \end{mdframed}

-Identical to \verb'GrB_Matrix_eWiseAdd' except that two scalars are used
-to define how to compute the result when entries are present in one of
-the two input matrices (\verb'A' and \verb'B'), but not the other.
-Each of the two input scalars, \verb'alpha' and \verb'beta'
-must contain an entry.
-When computing the result \verb'T=A+B',
-if \verb'A(i,j)' is present but \verb'B(i,j))' is not, then \verb'T(i,j)=A(i,j)+beta'.
-Likewise,
-if \verb'B(i,j)' is present but \verb'A(i,j)' is not, then \verb'T(i,j)=alpha+B(i,j)',
-where \verb'+' denotes the binary operator, \verb'add'.
+\verb'GrB_Matrix_assign_<type>' assigns a single scalar to an entire
+submatrix of \verb'C', like the {\em scalar expansion} \verb'C(I,J)=x' in
+MATLAB.  The scalar \verb'x' is implicitly expanded into a matrix \verb'A' of
+size \verb'ni' by \verb'nj', with all entries present and equal to \verb'x',
+and then the matrix \verb'A' is assigned to
+\verb'C(I,J)' using the same method as in \verb'GrB_Matrix_assign'.  Refer
+to that function in Section~\ref{assign_matrix} for further details.
+
+The \verb'Mask' has the same size as \verb'C'.
+
+For the accumulation step, the scalar \verb'x' is typecasted directly into the
+type of \verb'C' when the \verb'accum' operator is not applied to it, or into
+the \verb'ytype' of the \verb'accum' operator, if \verb'accum' is not NULL, for
+entries that are already present in \verb'C'.
+
+The \verb'<type> x' notation is otherwise the same as
+\verb'GrB_Matrix_setElement' (see Section~\ref{matrix_setElement}).  Any value
+can be passed to this function and its type will be detected, via the
+\verb'_Generic' feature of ANSI C11.  For a user-defined type, \verb'x' is a
+\verb'void *' pointer that points to a memory space holding a single entry of a
+scalar that has exactly the same user-defined type as the matrix \verb'C'.
+This user-defined type must exactly match the user-defined type of \verb'C'
+since no typecasting is done between user-defined types.
+
+If a \verb'void *' pointer is passed in and the type of the underlying scalar
+does not exactly match the user-defined type of \verb'C', then results are
+undefined.
+No error status will be returned since GraphBLAS has no way of
+catching this error.
+
+If \verb'x' is a \verb'GrB_Scalar' with no entry, then it is implicitly
+expanded into a matrix \verb'A' of size \verb'ni' by \verb'nj', with no entries
+present.
+
+Following the C API Specification, results are well-defined if \verb'I' or
+\verb'J' contain duplicate indices.  Duplicate indices are simply ignored.  See
+Section~\ref{duplicates} for more details.
+
+\paragraph{\bf Performance considerations:} % C(I,J) = scalar
+If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
+the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
+fastest if the format of \verb'C' is \verb'GxB_BY_COL'.  The opposite is true
+if \verb'A' is transposed.

\newpage
%===============================================================================
-\subsection{{\sf GrB\_extract:} submatrix extraction } %========================
+\subsection{Duplicate indices in {\sf GrB\_assign} and {\sf GxB\_subassign}}
%===============================================================================
-\label{extract}
+\label{duplicates}
+
+According to the GraphBLAS C API Specification, if the index vectors \verb'I' or
+\verb'J' contain duplicate indices, the results are undefined for
+\verb'GrB_Vector_assign', \verb'GrB_Matrix_assign', \verb'GrB_Col_assign', and
+\verb'GrB_Row_assign'.  Only the scalar assignment operations
+(\verb'GrB_Vector_assign_TYPE' and \verb'GrB_Matrix_assign_TYPE') are
+well-defined when duplicates appear in \verb'I' and \verb'J'.  In those two
+functions, duplicate indices are ignored.
+
+As an extension to the specification, SuiteSparse:GraphBLAS provides a
+definition of how duplicate indices are handled in all cases.  If \verb'I' has
+duplicate indices, they are ignored and the last unique entry in the list is
+used.  When no mask and no accumulator is present, the results are identical to
+how MATLAB handles duplicate indices in the built-in expression
+\verb'C(I,J)=A'.  Details of how this is done are shown below.
+
+{\small
+\begin{verbatim}
+    function C = subassign (C, I, J, A)
+    % submatrix assignment with pre-sort of I and J; and remove duplicates
+
+    % delete duplicates from I, keeping the last one seen
+    [I2 I2k] = sort (I) ;
+    Idupl = [(I2 (1:end-1) == I2 (2:end)), false] ;
+    I2  = I2  (~Idupl) ;
+    I2k = I2k (~Idupl) ;
+    assert (isequal (I2, unique (I)))
+
+    % delete duplicates from J, keeping the last one seen
+    [J2 J2k] = sort (J) ;
+    Jdupl = [(J2 (1:end-1) == J2 (2:end)), false] ;
+    J2  = J2  (~Jdupl) ;
+    J2k = J2k (~Jdupl) ;
+    assert (isequal (J2, unique (J)))
+
+    % do the submatrix assignment, with no duplicates in I2 or J2
+    C (I2,J2) = A (I2k,J2k) ;
+\end{verbatim}}
+
+If a mask is present, then it is replaced with \verb'M = M (I2k, J2k)' for
+\verb'GxB_subassign', or with \verb'M = M (I2, J2)' for \verb'GrB_assign'.
+If an accumulator operator is present, it is applied after the duplicates
+are removed, as (for example):
+
+{\small
+\begin{verbatim}
+    C (I2,J2) = C (I2,J2) + A (I2k,J2k) ;
+\end{verbatim}}

-The \verb'GrB_extract' function is a generic name for three specific functions:
-\verb'GrB_Vector_extract', \verb'GrB_Col_extract', and
-\verb'GrB_Matrix_extract'.  The generic name appears in the function signature,
-but the specific function name is used when describing what each variation
-does.
+These definitions allow the Octave/MATLAB interface to GraphBLAS to return the same
+results for \verb'C(I,J)=A' for a \verb'GrB' object as they do for built-in
+Octave/MATLAB matrices.  They also allow the assignment to be done in parallel.

-% \newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_extract:} extract subvector from vector}
-%-------------------------------------------------------------------------------
-\label{extract_vector}
+Results are always well-defined in SuiteSparse:GraphBLAS, but they might not be
+what you expect.  For example, suppose the \verb'MIN' operator is being used in
+the following assignment to the vector \verb'x', and suppose \verb'I' contains
+the entries \verb'[0 0]'.  Suppose \verb'x' is initially empty, of length 1,
+and suppose \verb'y' is a vector of length 2 with the values \verb'[5 7]'.

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
+{\small
\begin{verbatim}
-GrB_Info GrB_extract                // w = accum (w, u(I))
-(
-    GrB_Vector w,                   // input/output vector for results
-    const GrB_Vector mask,          // optional mask for w, unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
-    const GrB_Vector u,             // first input: vector u
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Descriptor desc       // descriptor for w and mask
-) ;
-\end{verbatim} } \end{mdframed}
+    #include "GraphBLAS.h"
+    #include <stdio.h>
+    int main (void)
+    {
+        GrB_init (GrB_NONBLOCKING) ;
+        GrB_Vector x, y ;
+        GrB_Vector_new (&x, GrB_INT32, 1) ;
+        GrB_Vector_new (&y, GrB_INT32, 2) ;
+        GrB_Index I [2] = {0, 0} ;
+        GrB_Vector_setElement (y, 5, 0) ;
+        GrB_Vector_setElement (y, 7, 1) ;
+        GrB_Vector_wait (&y) ;
+        GxB_print (x, 3) ;
+        GxB_print (y, 3) ;
+        GrB_assign (x, NULL, GrB_MIN_INT32, y, I, 2, NULL) ;
+        GrB_Vector_wait (&x) ;
+        GxB_print (x, 3) ;
+        GrB_finalize ( ) ;
+    }
+\end{verbatim}}

-\verb'GrB_Vector_extract' extracts a subvector from another vector, identical
-to \verb't = u (I)' in MATLAB where \verb'I' is an integer vector of row
-indices.  Refer to \verb'GrB_Matrix_extract' for further details; vector
-extraction is the same as matrix extraction with \verb'n'-by-1 matrices.
-See Section~\ref{colon} for a description of \verb'I' and \verb'ni'.
-The final step is ${\bf w \langle m \rangle = w \odot
-t}$, as described in Section~\ref{accummask}, except that all the terms are
-column vectors instead of matrices.
+You might (wrongly) expect the result to be the vector \verb'x(0)=5', since
+two entries seem to be assigned, and the min operator might be expected to
+take the minimum of the two.  This is not how SuiteSparse:GraphBLAS handles
+duplicates.

-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_extract:} extract submatrix from matrix}
-%-------------------------------------------------------------------------------
-\label{extract_matrix}
+Instead, the first duplicate index of \verb'I' is discarded
+(\verb'I [0] = 0', and \verb'y(0)=5'),
+and only the second entry is used
+(\verb'I [1] = 0', and \verb'y(1)=7').
+The output of the above program is: -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize +{\small \begin{verbatim} -GrB_Info GrB_extract // C = accum (C, A(I,J)) -( - GrB_Matrix C, // input/output matrix for results - const GrB_Matrix Mask, // optional mask for C, unused if NULL - const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) - const GrB_Matrix A, // first input: matrix A - const GrB_Index *I, // row indices - const GrB_Index ni, // number of row indices - const GrB_Index *J, // column indices - const GrB_Index nj, // number of column indices - const GrB_Descriptor desc // descriptor for C, Mask, and A -) ; -\end{verbatim} } \end{mdframed} -\verb'GrB_Matrix_extract' extracts a submatrix from another matrix, identical -to \verb'T = A(I,J)' in MATLAB where \verb'I' and \verb'J' are integer vectors -of row and column indices, respectively, except that indices are zero-based in -GraphBLAS and one-based in MATLAB. The input matrix \verb'A' may be transposed -first, via the descriptor. The type of \verb'T' and \verb'A' are the same. -The size of \verb'C' is \verb'|I|'-by-\verb'|J|'. -Entries outside \verb'A(I,J)' are not accessed and do not take part in the -computation. More precisely, assuming the matrix \verb'A' is not transposed, -the matrix \verb'T' is defined as follows: + 1x1 GraphBLAS int32_t vector, sparse by col: + x, no entries - \vspace{-0.1in} - {\footnotesize - \begin{verbatim} - T.matrix = zeros (ni, nj) ; % a matrix of size ni-by-nj - T.pattern = false (ni, nj) ; - for i = 1:ni - for j = 1:nj - if (A (I(i),J(j)).pattern) - T (i,j).matrix = A (I(i),J(j)).matrix ; - T (i,j).pattern = true ; - end - end - end \end{verbatim}} -\vspace{-0.1in} -If duplicate indices are present in \verb'I' or \verb'J', the above method -defines the result in \verb'T'. Duplicates result in the same values of -\verb'A' being copied into different places in \verb'T'. -See Section~\ref{colon} for a description of the row indices -\verb'I' and \verb'ni', and the column indices -\verb'J' and \verb'nj'. -The final step is ${\bf C \langle M \rangle = C \odot -T}$, as described in Section~\ref{accummask}. + 2x1 GraphBLAS int32_t vector, sparse by col: + y, 2 entries -\paragraph{\bf Performance considerations:} % C=A(I,J) -If \verb'A' is not transposed via input descriptor: if \verb'|I|' is small, -then it is fastest if \verb'A' is \verb'GxB_BY_ROW'; if -\verb'|J|' is small, then it is fastest if \verb'A' is -\verb'GxB_BY_COL'. The opposite is true if \verb'A' is transposed. 
+ (0,0) 5 + (1,0) 7 -\newpage -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Col\_extract:} extract column vector from matrix} -%------------------------------------------------------------------------------- -\label{extract_column} -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GrB_extract // w = accum (w, A(I,j)) -( - GrB_Vector w, // input/output matrix for results - const GrB_Vector mask, // optional mask for w, unused if NULL - const GrB_BinaryOp accum, // optional accum for z=accum(w,t) - const GrB_Matrix A, // first input: matrix A - const GrB_Index *I, // row indices - const GrB_Index ni, // number of row indices - const GrB_Index j, // column index - const GrB_Descriptor desc // descriptor for w, mask, and A -) ; -\end{verbatim} } \end{mdframed} + 1x1 GraphBLAS int32_t vector, sparse by col: + x, 1 entry -\verb'GrB_Col_extract' extracts a subvector from a matrix, identical to -\verb't = A (I,j)' in MATLAB where \verb'I' is an integer vector of row indices -and where \verb'j' is a single column index. The input matrix \verb'A' may be -transposed first, via the descriptor, which results in the extraction of a -single row \verb'j' from the matrix \verb'A', the result of which is a column -vector \verb'w'. The type of \verb't' and \verb'A' are the same. -The size of \verb'w' is \verb'|I|'-by-1. + (0,0) 7 -See Section~\ref{colon} for a description of the row indices -\verb'I' and \verb'ni'. -The final step is ${\bf w \langle m -\rangle = w \odot t}$, as described in Section~\ref{accummask}, except that -all the terms are column vectors instead of matrices. +\end{verbatim}} + +You see that the result is \verb'x(0)=7', since the \verb'y(0)=5' entry +has been ignored because of the duplicate indices in \verb'I'. + +\begin{alert} +{\bf SPEC:} Providing a well-defined behavior for duplicate +indices with matrix and vector assignment is an extension to the specification. +The specification only defines the behavior when assigning a scalar into a matrix +or vector, and states that duplicate indices otherwise lead to undefined +results. +\end{alert} -\paragraph{\bf Performance considerations:} % w = A(I,j) -If \verb'A' is not transposed: it is fastest if the format of \verb'A' is -\verb'GxB_BY_COL'. The opposite is true if \verb'A' is transposed. \newpage %=============================================================================== -\subsection{{\sf GxB\_subassign:} submatrix assignment} %======================= +\subsection{Comparing {\sf GrB\_assign} and {\sf GxB\_subassign}} %============= %=============================================================================== -\label{subassign} +\label{compare_assign} -The methods described in this section are all variations of the form -\verb'C(I,J)=A', which modifies a submatrix of the matrix \verb'C'. All -methods can be used in their generic form with the single name -\verb'GxB_subassign'. This is reflected in the prototypes. However, to avoid -confusion between the different kinds of assignment, the name of the specific -function is used when describing each variation. If the discussion applies to -all variations, the simple name \verb'GxB_subassign' is used. +The \verb'GxB_subassign' and \verb'GrB_assign' operations are very similar, but +they differ in two ways: -See Section~\ref{colon} for a description of the row indices -\verb'I' and \verb'ni', and the column indices -\verb'J' and \verb'nj'. 
+\begin{enumerate}
+\item {\bf The Mask has a different size:}
+    The mask in \verb'GxB_subassign' has the same dimensions as \verb'w(I)' for
+    vectors and \verb'C(I,J)' for matrices.  In \verb'GrB_assign', the mask is
+    the same size as \verb'w' or \verb'C', respectively (except for the row/col
+    variants).  The two masks are related.  If \verb'M' is the mask for
+    \verb'GrB_assign', then \verb'M(I,J)' is the mask for \verb'GxB_subassign'.
+    If there is no mask, or if \verb'I' and \verb'J' are both \verb'GrB_ALL',
+    the two masks are the same.
+    For \verb'GrB_Row_assign' and \verb'GrB_Col_assign', the \verb'mask' vector
+    is the same size as a row or column of \verb'C', respectively.  For the
+    corresponding \verb'GxB_Row_subassign' and \verb'GxB_Col_subassign'
+    operations, the \verb'mask' is the same size as the sub-row \verb'C(i,J)' or
+    subcolumn \verb'C(I,j)', respectively.

-\verb'GxB_subassign' is very similar to \verb'GrB_assign', described in
-Section~\ref{assign}.  The two operations are compared and contrasted in
-Section~\ref{compare_assign}.  For a discussion of how duplicate indices
-are handled in \verb'I' and \verb'J', see Section~\ref{duplicates}.
+\item {\bf \verb'GrB_REPLACE' is different:}
+    They differ in how \verb'C' is affected in areas outside the \verb'C(I,J)'
+    submatrix.  In \verb'GxB_subassign', the \verb'C(I,J)' submatrix is the
+    only part of \verb'C' that can be modified, and no part of \verb'C' outside
+    the submatrix is ever modified.  In \verb'GrB_assign', it is possible to
+    delete entries in \verb'C' outside the submatrix, but only in one specific
+    manner.  Suppose the mask \verb'M' is present (or, suppose it is not
+    present but \verb'GrB_COMP' is true).  After (optionally) complementing the
+    mask, the value of \verb'M(i,j)' can be 0 for some entry outside the
+    \verb'C(I,J)' submatrix.  If the \verb'GrB_REPLACE' descriptor is
+    true, \verb'GrB_assign' deletes this entry.

-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Vector\_subassign:} assign to a subvector }
-%-------------------------------------------------------------------------------
-\label{subassign_vector}
+\end{enumerate}
+
+\verb'GxB_subassign' and \verb'GrB_assign' are identical if \verb'GrB_REPLACE'
+is set to its default value of false, and if the masks happen to be the same.
+The two masks can be the same in two cases: either the \verb'Mask' input is
+\verb'NULL' (and it is not complemented via \verb'GrB_COMP'), or \verb'I' and
+\verb'J' are both \verb'GrB_ALL'.
+If all these conditions hold,
+the two algorithms are identical and have the same performance.  Otherwise,
+\verb'GxB_subassign' is much faster than \verb'GrB_assign' when the latter
+must examine the entire matrix \verb'C' to delete entries (when
+\verb'GrB_REPLACE' is true), and if it must deal with a much larger \verb'Mask'
+matrix.  However, both methods have specific uses.
+
+Consider using \verb'C(I,J)+=F' for many submatrices \verb'F' (for example,
+when assembling a finite-element matrix).  If the \verb'Mask' is meant as a
+specification for which entries of \verb'C' should appear in the final result,
+then use \verb'GrB_assign'.
+
+If instead the \verb'Mask' is meant to control which entries of the submatrix
+\verb'C(I,J)' are modified by the finite-element \verb'F', then use
+\verb'GxB_subassign'.  This is particularly useful if the \verb'Mask' is a
+template that follows along with the finite-element \verb'F', independent of
+where it is applied to \verb'C'.  Using \verb'GrB_assign' would be very
+difficult in this case since a new \verb'Mask', the same size as \verb'C',
+would need to be constructed for each finite-element \verb'F'.

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // w(I) = accum (w(I),u)
-(
-    GrB_Vector w,                   // input/output matrix for results
-    const GrB_Vector mask,          // optional mask for w(I), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(w(I),t)
-    const GrB_Vector u,             // first input: vector u
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Descriptor desc       // descriptor for w(I) and mask
-) ;
-\end{verbatim} } \end{mdframed}
+In GraphBLAS notation, the two methods can be described as follows:

-\verb'GxB_Vector_subassign' operates on a subvector \verb'w(I)' of \verb'w',
-modifying it with the vector \verb'u'.  The method is identical to
-\verb'GxB_Matrix_subassign' described in Section~\ref{subassign_matrix}, where
-all matrices have a single column each.  The \verb'mask' has the same size as
-\verb'w(I)' and \verb'u'.  The only other difference is that the input \verb'u'
-in this method is not transposed via the \verb'GrB_INP0' descriptor.
+\vspace{0.05in}
+\begin{tabular}{ll}
+\hline
+matrix and vector subassign & ${\bf C(I,J) \langle M \rangle} = {\bf C(I,J)} \odot {\bf A}$ \\
+matrix and vector assign    & ${\bf C \langle M \rangle (I,J)} = {\bf C(I,J)} \odot {\bf A}$ \\
+\hline
+\end{tabular}
+\vspace{0.05in}

-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_subassign:} assign to a submatrix }
-%-------------------------------------------------------------------------------
-\label{subassign_matrix}
+This notation does not include the details of the \verb'GrB_COMP' and
+\verb'GrB_REPLACE' descriptors, but it does illustrate the difference in the
+\verb'Mask'.  In the subassign, \verb'Mask' is the same size as \verb'C(I,J)'
+and \verb'A'.  If \verb'I[0]=i' and \verb'J[0]=j', then \verb'Mask(0,0)'
+controls how \verb'C(i,j)' is modified by the subassign, from the value
+\verb'A(0,0)'.  In the assign, \verb'Mask' is the same size as \verb'C', and
+\verb'Mask(i,j)' controls how \verb'C(i,j)' is modified.

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // C(I,J) = accum (C(I,J),A)
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Matrix Mask,          // optional mask for C(I,J), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for Z=accum(C(I,J),T)
-    const GrB_Matrix A,             // first input: matrix A
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C(I,J), Mask, and A
-) ;
-\end{verbatim} } \end{mdframed}
+The \verb'GxB_subassign' and \verb'GrB_assign' functions have the same
+signatures; they differ only in how they consider the \verb'Mask' and the
+\verb'GrB_REPLACE' descriptor.

-\verb'GxB_Matrix_subassign' operates only on a submatrix \verb'S' of \verb'C',
-modifying it with the matrix \verb'A'.  For this operation, the result is not
-the entire matrix \verb'C', but a submatrix \verb'S=C(I,J)' of \verb'C'.  The
-steps taken are as follows, except that ${\bf A}$ may be optionally transposed
-via the \verb'GrB_INP0' descriptor option.
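+
+For example, the two calls below (a hypothetical fragment; \verb'C', \verb'M',
+\verb'A', \verb'I', and \verb'J' are assumed to exist already) have identical
+argument lists, and differ only in the expected size of \verb'M':
+
+{\footnotesize
+\begin{verbatim}
+    // M must be ni-by-nj, the same size as C(I,J):
+    GxB_Matrix_subassign (C, M, NULL, A, I, ni, J, nj, NULL) ;
+    // M must be the same size as all of C:
+    GrB_Matrix_assign    (C, M, NULL, A, I, ni, J, nj, NULL) ;
+\end{verbatim}}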
+Details of each step of the two operations are listed below:

\vspace{0.1in}
\begin{tabular}{lll}
\hline
-Step & GraphBLAS & description \\
-     & notation  & \\
+Step & \verb'GrB_Matrix_assign' & \verb'GxB_Matrix_subassign' \\
\hline
-1 & ${\bf S} = {\bf C(I,J)}$ & extract the ${\bf C(I,J)}$ submatrix \\
-2 & ${\bf S \langle M \rangle} = {\bf S} \odot {\bf A}$ & apply the accumulator/mask to the submatrix ${\bf S}$\\
-3 & ${\bf C(I,J)}= {\bf S}$ & put the submatrix ${\bf S}$ back into ${\bf C(I,J)}$ \\
+1 & ${\bf S} = {\bf C(I,J)}$ & ${\bf S} = {\bf C(I,J)}$ \\
+2 & ${\bf S} = {\bf S} \odot {\bf A}$ & ${\bf S \langle M \rangle} = {\bf S} \odot {\bf A}$ \\
+3 & ${\bf Z} = {\bf C}$ & ${\bf C(I,J)}= {\bf S}$ \\
+4 & ${\bf Z(I,J)} = {\bf S}$ & \\
+5 & ${\bf C \langle M \rangle = Z}$ & \\
\hline
\end{tabular}
\vspace{0.1in}

+Step 1 is the same.  In the Accumulator Phase (Step 2), the expression
+${\bf S} \odot {\bf A}$,
+described in Section~\ref{accummask}, is the same in both
+operations.  The result is simply ${\bf A}$ if \verb'accum' is \verb'NULL'.  It
+only applies to the submatrix ${\bf S}$, not the whole matrix.
+The result ${\bf S} \odot {\bf A}$ is used differently in the Mask/Replace
+phase.
+The Mask/Replace Phase, described in Section~\ref{accummask}, is different:

\begin{itemize}
 \item
-For submatrix extraction,
-${\bf C \langle M \rangle = C \odot A(I,J)}$ is computed,
-where the submatrix is on the right.
-The mask ${\bf M}$ has the same size as the submatrix ${\bf A(I,J)}$.
+    For \verb'GrB_assign' (Step 5), the mask is applied to all of ${\bf
+    C}$.  The mask has the same size as ${\bf C}$.  Just prior to making the
+    assignment via the mask, the \verb'GrB_REPLACE' option can be used to clear
+    all of ${\bf C}$ first.  This is the only way in which entries in ${\bf C}$ that
+    are outside the ${\bf C(I,J)}$ submatrix can be modified by this operation.

\item
-For submatrix assignment,
-${\bf C(I,J) \langle M \rangle = C(I,J) \odot A}$ is computed,
-where the submatrix is on the left.
-The mask ${\bf M}$ has the same size as the submatrix ${\bf C(I,J)}$.
+    For \verb'GxB_subassign' (Step 2b), the mask is applied to just
+    ${\bf S}$.  The mask has the same size as ${\bf C(I,J)}$, ${\bf S}$, and
+    ${\bf A}$.  Just prior to making the assignment via the mask, the
+    \verb'GrB_REPLACE' option can be used to clear ${\bf S}$ first.  No entries
+    in ${\bf C}$ that are outside the ${\bf C(I,J)}$ submatrix can be modified
+    by this operation.  Thus, \verb'GrB_REPLACE' has no effect on entries in
+    ${\bf C}$ outside the ${\bf C(I,J)}$ submatrix.

\end{itemize}

-In Step 1, the submatrix \verb'S' is first computed by the
-\verb'GrB_Matrix_extract' operation, \verb'S=C(I,J)'.
-
-Step 2 accumulates the results ${\bf S \langle M \rangle = S \odot T}$,
-exactly as described in Section~\ref{accummask}, but operating on the submatrix
-${\bf S}$, not ${\bf C}$, using the optional \verb'Mask' and \verb'accum'
-operator.  The matrix ${\bf T}$ is simply ${\bf T}={\bf A}$, or ${\bf T}={\bf
-A}^{\sf T}$ if ${\bf A}$ is transposed via the \verb'desc' descriptor,
-\verb'GrB_INP0'.  The \verb'GrB_REPLACE' option in the descriptor clears ${\bf
-S}$ after computing ${\bf Z = T}$ or ${\bf Z = C \odot T}$, not all of ${\bf
-C}$ since this operation can only modify the specified submatrix of ${\bf C}$.
-
-Finally, Step 3 writes the result (which is the modified submatrix \verb'S' and
-not all of \verb'C') back into the \verb'C' matrix that contains it, via the
-assignment \verb'C(I,J)=S', using the reverse operation from the method
-described for matrix extraction:
-
-    {\footnotesize
-    \begin{verbatim}
-    for i = 1:ni
-        for j = 1:nj
-            if (S (i,j).pattern)
-                C (I(i),J(j)).matrix  = S (i,j).matrix ;
-                C (I(i),J(j)).pattern = true ;
-            end
-        end
-    end \end{verbatim}}
-
-\paragraph{\bf Performance considerations:} % C(I,J) = A
-If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
-the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
-fastest if the format of \verb'C' is \verb'GxB_BY_COL'.  The opposite is true
-if \verb'A' is transposed.
-
-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Col\_subassign:} assign to a sub-column of a matrix}
-%-------------------------------------------------------------------------------
-\label{subassign_column}
-
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // C(I,j) = accum (C(I,j),u)
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Vector mask,          // optional mask for C(I,j), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(C(I,j),t)
-    const GrB_Vector u,             // input vector
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Index j,              // column index
-    const GrB_Descriptor desc       // descriptor for C(I,j) and mask
-) ;
-\end{verbatim} } \end{mdframed}
-
-\verb'GxB_Col_subassign' modifies a single sub-column of a matrix \verb'C'.  It
-is the same as \verb'GxB_Matrix_subassign' where the index vector \verb'J[0]=j'
-is a single column index (and thus \verb'nj=1'), and where all matrices in
-\verb'GxB_Matrix_subassign' (except \verb'C') consist of a single column.  The
-\verb'mask' vector has the same size as \verb'u' and the sub-column
-\verb'C(I,j)'.  The input descriptor \verb'GrB_INP0' is ignored; the input
-vector \verb'u' is not transposed.  Refer to \verb'GxB_Matrix_subassign' for
-further details.
-
-\paragraph{\bf Performance considerations:} % C(I,j) = u
-\verb'GxB_Col_subassign' is much faster than \verb'GxB_Row_subassign' if the
-format of \verb'C' is \verb'GxB_BY_COL'.
-\verb'GxB_Row_subassign' is much faster than \verb'GxB_Col_subassign' if the
-format of \verb'C' is \verb'GxB_BY_ROW'.
+The differences between \verb'GrB_assign' and
+\verb'GxB_subassign' can be seen in Tables~\ref{insubmatrix} and
+\ref{outsubmatrix}.  The first table considers the case when the entry $c_{ij}$
+is in the ${\bf C(I,J)}$ submatrix, and it describes what is computed for both
+\verb'GrB_assign' and \verb'GxB_subassign'.  They perform the
+exact same computation; the only difference is how the value of the mask is
+specified.  Compare Table~\ref{insubmatrix} with Table~\ref{tab:maskaccum}
+in Section~\ref{sec:maskaccum}.

-% \newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Row\_subassign:} assign to a sub-row of a matrix}
-%-------------------------------------------------------------------------------
-\label{subassign_row}
+The first column of Table~\ref{insubmatrix} is {\em yes} if \verb'GrB_REPLACE' is enabled,
+and a dash otherwise.  The second column is {\em yes} if an accumulator
+operator is given, and a dash otherwise.  The third column is $c_{ij}$ if the
+entry is present in ${\bf C}$, and a dash otherwise.  The fourth column is
+$a_{i'j'}$ if the corresponding entry is present in ${\bf A}$, where
+$i={\bf I}(i')$ and $j={\bf J}(j')$.

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // C(i,J) = accum (C(i,J),u')
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Vector mask,          // optional mask for C(i,J), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(C(i,J),t)
-    const GrB_Vector u,             // input vector
-    const GrB_Index i,              // row index
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C(i,J) and mask
-) ;
-\end{verbatim} } \end{mdframed}
+The {\em mask} column is 1 if the effective value of the mask allows ${\bf
+C}$ to be modified, and 0 otherwise.  This is $m_{ij}$ for \verb'GrB_assign',
+and $m_{i'j'}$ for \verb'GxB_subassign', to reflect the difference in the mask,
+but this difference is not reflected in the table.  The value 1 or 0 is the
+value of the entry in the mask after it is optionally complemented via the
+\verb'GrB_COMP' option.

-\verb'GxB_Row_subassign' modifies a single sub-row of a matrix \verb'C'.  It is
-the same as \verb'GxB_Matrix_subassign' where the index vector \verb'I[0]=i' is
-a single row index (and thus \verb'ni=1'), and where all matrices in
-\verb'GxB_Matrix_subassign' (except \verb'C') consist of a single row.  The
-\verb'mask' vector has the same size as \verb'u' and the sub-column
-\verb'C(I,j)'.  The input descriptor \verb'GrB_INP0' is ignored; the input
-vector \verb'u' is not transposed.  Refer to \verb'GxB_Matrix_subassign' for
-further details.
+Finally, the last column is the action taken in this case.  It is left blank if
+no action is taken, in which case $c_{ij}$ is not modified if present, or not
+inserted into ${\bf C}$ if not present.

-\paragraph{\bf Performance considerations:} % C(i,J) = u'
-\verb'GxB_Col_subassign' is much faster than \verb'GxB_Row_subassign' if the
-format of \verb'C' is \verb'GxB_BY_COL'.  \verb'GxB_Row_subassign' is much
-faster than \verb'GxB_Col_subassign' if the format of \verb'C' is
-\verb'GxB_BY_ROW'.
+\begin{table}
+{\small
+\begin{tabular}{lllll|l}
+\hline
+repl & accum & ${\bf C}$ & ${\bf A}$ & mask & action taken by \verb'GrB_assign' and \verb'GxB_subassign'\\
+\hline
+ -   &-      & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, update \\
+ -   &-      & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
+ -   &-      & $c_{ij}$  & -           & 1 & delete $c_{ij}$ because $a_{i'j'}$ not present \\
+ -   &-      & -         & -           & 1 & \\
+ -   &-      & $c_{ij}$  & $a_{i'j'}$  & 0 & \\
+ -   &-      & -         & $a_{i'j'}$  & 0 & \\
+ -   &-      & $c_{ij}$  & -           & 0 & \\
+ -   &-      & -         & -           & 0 & \\
+\hline
+ yes &-      & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, update \\
+ yes &-      & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
+ yes &-      & $c_{ij}$  & -           & 1 & delete $c_{ij}$ because $a_{i'j'}$ not present \\
+ yes &-      & -         & -           & 1 & \\
+ yes &-      & $c_{ij}$  & $a_{i'j'}$  & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes &-      & -         & $a_{i'j'}$  & 0 & \\
+ yes &-      & $c_{ij}$  & -           & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes &-      & -         & -           & 0 & \\
+\hline
+ -   &yes    & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = c_{ij} \odot a_{i'j'}$, apply accumulator \\
+ -   &yes    & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
+ -   &yes    & $c_{ij}$  & -           & 1 & \\
+ -   &yes    & -         & -           & 1 & \\
+ -   &yes    & $c_{ij}$  & $a_{i'j'}$  & 0 & \\
+ -   &yes    & -         & $a_{i'j'}$  & 0 & \\
+ -   &yes    & $c_{ij}$  & -           & 0 & \\
+ -   &yes    & -         & -           & 0 & \\
+\hline
+ yes &yes    & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = c_{ij} \odot a_{i'j'}$, apply accumulator \\
+ yes &yes    & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
+ yes &yes    & $c_{ij}$  & -           & 1 & \\
+ yes &yes    & -         & -           & 1 & \\
+ yes &yes    & $c_{ij}$  & $a_{i'j'}$  & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes &yes    & -         & $a_{i'j'}$  & 0 & \\
+ yes &yes    & $c_{ij}$  & -           & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes &yes    & -         & -           & 0 & \\
+\hline
+\end{tabular}
+}
+\caption{Results of assign and subassign for entries in the ${\bf C(I,J)}$ submatrix \label{insubmatrix}}
+\end{table}

-% \newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Vector\_subassign\_$<$type$>$:} assign a scalar to a subvector}
-%-------------------------------------------------------------------------------
-\label{subassign_vector_scalar}
+\newpage
+Table~\ref{outsubmatrix} illustrates how \verb'GrB_assign' and
+\verb'GxB_subassign' differ for entries outside the submatrix.
+\verb'GxB_subassign' never modifies any entry outside the ${\bf C(I,J)}$
+submatrix, but \verb'GrB_assign' can modify them in two cases listed in
+Table~\ref{outsubmatrix}.  When the \verb'GrB_REPLACE' option is selected, and
+when the \verb'Mask(i,j)' for an entry $c_{ij}$ is false (or if the
+\verb'Mask(i,j)' is true and \verb'GrB_COMP' is enabled via the descriptor),
+then the entry is deleted by \verb'GrB_assign'.

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // w(I) = accum (w(I),x)
-(
-    GrB_Vector w,                   // input/output vector for results
-    const GrB_Vector mask,          // optional mask for w(I), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(w(I),x)
-    const <type> x,                 // scalar to assign to w(I)
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Descriptor desc       // descriptor for w(I) and mask
-) ;
-\end{verbatim} } \end{mdframed}
+The fourth column of Table~\ref{outsubmatrix} differs from
+Table~\ref{insubmatrix}, since entries in ${\bf A}$ never affect these entries.
+
+Instead, for all index pairs outside the $I \times J$ submatrix, ${\bf C}$ and
+${\bf Z}$ are identical (see Step 3 above).  As a result, each section of the
+table includes just two cases: either $c_{ij}$ is present, or not.  This is in
+contrast to Table~\ref{insubmatrix}, where each section must consider four
+different cases.

-\verb'GxB_Vector_subassign_<type>' assigns a single scalar to an entire
-subvector of the vector \verb'w'.  The operation is exactly like setting a
-single entry in an \verb'n'-by-1 matrix, \verb'A(I,0) = x', where the column
-index for a vector is implicitly \verb'j=0'.  For further details of this
-function, see \verb'GxB_Matrix_subassign_<type>' in
-Section~\ref{subassign_matrix_scalar}.
+The \verb'GrB_Row_assign' and \verb'GrB_Col_assign' operations are slightly
+different.  They only affect a single row or column of ${\bf C}$.
+For \verb'GrB_Row_assign', Table~\ref{outsubmatrix} only applies to entries in
+the single row \verb'C(i,J)' that are outside the list of indices, \verb'J'.
+For \verb'GrB_Col_assign', Table~\ref{outsubmatrix} only applies to entries in
+the single column \verb'C(I,j)' that are outside the list of indices, \verb'I'.
+
+\begin{table}
+{\small
+\begin{tabular}{lllll|l}
+\hline
+repl & accum & ${\bf C}$ & ${\bf C=Z}$ & mask & action taken by \verb'GrB_assign' \\
+\hline
+ -   &-      & $c_{ij}$  & $c_{ij}$    & 1 & \\
+ -   &-      & -         & -           & 1 & \\
+ -   &-      & $c_{ij}$  & $c_{ij}$    & 0 & \\
+ -   &-      & -         & -           & 0 & \\
+\hline
+ yes & -     & $c_{ij}$  & $c_{ij}$    & 1 & \\
+ yes & -     & -         & -           & 1 & \\
+ yes & -     & $c_{ij}$  & $c_{ij}$    & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes & -     & -         & -           & 0 & \\
+\hline
+ -   &yes    & $c_{ij}$  & $c_{ij}$    & 1 & \\
+ -   &yes    & -         & -           & 1 & \\
+ -   &yes    & $c_{ij}$  & $c_{ij}$    & 0 & \\
+ -   &yes    & -         & -           & 0 & \\
+\hline
+ yes & yes   & $c_{ij}$  & $c_{ij}$    & 1 & \\
+ yes & yes   & -         & -           & 1 & \\
+ yes & yes   & $c_{ij}$  & $c_{ij}$    & 0 & delete $c_{ij}$  (because of \verb'GrB_REPLACE') \\
+ yes & yes   & -         & -           & 0 & \\
+\hline
+\end{tabular}
+}
+\caption{Results of assign for entries outside the
+${\bf C(I,J)}$ submatrix.  Subassign has no effect on these entries. \label{outsubmatrix}}
+\end{table}

-\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GxB\_Matrix\_subassign\_$<$type$>$:} assign a scalar to a submatrix}
+\subsubsection{Example}
%-------------------------------------------------------------------------------
-\label{subassign_matrix_scalar}

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_subassign              // C(I,J) = accum (C(I,J),x)
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Matrix Mask,          // optional mask for C(I,J), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for Z=accum(C(I,J),x)
-    const <type> x,                 // scalar to assign to C(I,J)
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C(I,J) and Mask
-) ;
-\end{verbatim} } \end{mdframed}
+The difference between \verb'GxB_subassign' and \verb'GrB_assign' is
+illustrated in the following example.  Consider the 2-by-2 matrix ${\bf C}$
+where all entries are present.

-\verb'GxB_Matrix_subassign_<type>' assigns a single scalar to an entire
-submatrix of \verb'C', like the {\em scalar expansion} \verb'C(I,J)=x' in
-MATLAB.
-The scalar \verb'x' is implicitly expanded into a matrix \verb'A' of
-size \verb'ni' by \verb'nj', with all entries present and equal to \verb'x',
-and then the matrix \verb'A' is assigned to
-\verb'C(I,J)' using the same method as in \verb'GxB_Matrix_subassign'.  Refer
-to that function in Section~\ref{subassign_matrix} for further details.
-For the accumulation step, the scalar \verb'x' is typecasted directly into the
-type of \verb'C' when the \verb'accum' operator is not applied to it, or into
-the \verb'ytype' of the \verb'accum' operator, if \verb'accum' is not NULL, for
-entries that are already present in \verb'C'.
+\[
+{\bf C} = \left[
+    \begin{array}{rr}
+    11 & 12 \\
+    21 & 22 \\
+    \end{array}
+    \right]
+\]

-The \verb'<type> x' notation is otherwise the same as
-\verb'GrB_Matrix_setElement' (see Section~\ref{matrix_setElement}).  Any value
-can be passed to this function and its type will be detected, via the
-\verb'_Generic' feature of ANSI C11.  For a user-defined type, \verb'x' is a
-\verb'void *' pointer that points to a memory space holding a single entry of a
-scalar that has exactly the same user-defined type as the matrix \verb'C'.
-This user-defined type must exactly match the user-defined type of \verb'C'
-since no typecasting is done between user-defined types.
+Suppose \verb'GrB_REPLACE' is true, and \verb'GrB_COMP' is false.  Let the
+\verb'Mask' be:

-If a \verb'void *' pointer is passed in and the type of the underlying scalar
-does not exactly match the user-defined type of \verb'C', then results are
-undefined.  No error status will be returned since GraphBLAS has no way of
-catching this error.
+\[
+{\bf M} = \left[
+    \begin{array}{rr}
+    1 & 1 \\
+    0 & 1 \\
+    \end{array}
+    \right].
+\]

-If \verb'x' is a \verb'GrB_Scalar' with no entry, then it is implicitly
-expanded into a matrix \verb'A' of size \verb'ni' by \verb'nj', with no entries
-present.
+Let ${\bf A} = 100$, and let the index sets be ${\bf I}=0$ and ${\bf J}=1$.
+Consider the computation
+${\bf C \langle M \rangle} (0,1) = {\bf C}(0,1) + {\bf A}$,
+using the \verb'GrB_assign' operation.  The result is:
+\[
+{\bf C} = \left[
+    \begin{array}{rr}
+    11 & 112 \\
+     - & 22 \\
+    \end{array}
+    \right].
+\]
+The $(0,1)$ entry is updated and the $(1,0)$ entry is deleted because
+its \verb'Mask' is zero.  The other two entries are not modified since ${\bf Z}
+= {\bf C}$ outside the submatrix, and those two values are written back into
+${\bf C}$ because their \verb'Mask' values are 1.  The $(1,0)$ entry is deleted
+because the entry ${\bf Z}(1,0)=21$ is prevented from being written back into
+${\bf C}$ since \verb'Mask(1,0)=0'.

-\paragraph{\bf Performance considerations:} % C(I,J) = scalar
-If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
-the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
-fastest if the format of \verb'C' is \verb'GxB_BY_COL'.  The opposite is true
-if \verb'A' is transposed.
+Now consider the analogous \verb'GxB_subassign' operation.  The \verb'Mask' has
+the same size as ${\bf A}$, namely:

-\newpage
-%===============================================================================
-\subsection{{\sf GrB\_assign:} submatrix assignment} %==========================
-%===============================================================================
-\label{assign}
+\[
+{\bf M} = \left[
+    \begin{array}{r}
+    1 \\
+    \end{array}
+    \right].
+\]

-The methods described in this section are all variations of the form
-\verb'C(I,J)=A', which modifies a submatrix of the matrix \verb'C'.  All
-methods can be used in their generic form with the single name
-\verb'GrB_assign'.  These methods are very similar to their
-\verb'GxB_subassign' counterparts in Section~\ref{subassign}.  They differ
-primarily in the size of the \verb'Mask', and how the \verb'GrB_REPLACE' option
-works.  Section~\ref{compare_assign} compares
-\verb'GxB_subassign' and \verb'GrB_assign'.
+After computing
+${\bf C} (0,1) {\bf \langle M \rangle} = {\bf C}(0,1) + {\bf A}$,
+the result is

-See Section~\ref{colon} for a description of
-\verb'I', \verb'ni', \verb'J', and \verb'nj'.
+\[
+{\bf C} = \left[
+    \begin{array}{rr}
+    11 & 112 \\
+    21 & 22 \\
+    \end{array}
+    \right].
+\]
+Only the ${\bf C(I,J)}$ submatrix, the single entry ${\bf C}(0,1)$, is modified
+by \verb'GxB_subassign'.  The entry ${\bf C}(1,0)=21$ is unaffected by
+\verb'GxB_subassign', but it is deleted by \verb'GrB_assign'.
+
+\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_assign:} assign to a subvector }
+\subsubsection{Performance of {\sf GxB\_subassign}, {\sf GrB\_assign}
+and {\sf GrB\_*\_setElement}}
%-------------------------------------------------------------------------------
-\label{assign_vector}

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_assign                 // w(I) = accum (w(I),u)
-(
-    GrB_Vector w,                   // input/output matrix for results
-    const GrB_Vector mask,          // optional mask for w, unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(w(I),t)
-    const GrB_Vector u,             // first input: vector u
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Descriptor desc       // descriptor for w and mask
-) ;
-\end{verbatim} } \end{mdframed}
+When SuiteSparse:GraphBLAS uses non-blocking mode, the modifications to a
+matrix by \verb'GxB_subassign', \verb'GrB_assign', and \verb'GrB_*_setElement'
+can be postponed, and computed all at once later on.  This has a huge impact on
+performance.

-\verb'GrB_Vector_assign' operates on a subvector \verb'w(I)' of \verb'w',
-modifying it with the vector \verb'u'.  The \verb'mask' vector has the same
-size as \verb'w'.  The method is identical to \verb'GrB_Matrix_assign'
-described in Section~\ref{assign_matrix}, where all matrices have a single
-column each.  The only other difference is that the input \verb'u' in this
-method is not transposed via the \verb'GrB_INP0' descriptor.
+A sequence of assignments is fast if their completion can be postponed for as
+long as possible, or if they do not modify the pattern at all.  Modifying the
+pattern can be costly, but it is fast if non-blocking mode can be fully
+exploited.

-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_assign:} assign to a submatrix }
-%-------------------------------------------------------------------------------
-\label{assign_matrix}
+Consider a sequence of $t$ submatrix assignments \verb'C(I,J)=C(I,J)+A' to an
+$n$-by-$n$ matrix \verb'C' where each submatrix \verb'A' has size $a$-by-$a$
+with $s$ entries, and where \verb'C' starts with $c$ entries.
+Assume the matrices are all stored in non-hypersparse form, by row
+(\verb'GxB_BY_ROW').
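+
+For reference, one such sequence can be written as the following sketch
+(hypothetical names; the \verb't' submatrices \verb'A [k]' and the index lists
+\verb'I [k]' and \verb'J [k]', each of length \verb'a', are assumed to be
+created elsewhere):
+
+{\footnotesize
+\begin{verbatim}
+    for (int k = 0 ; k < t ; k++)
+    {
+        // C (I,J) = C (I,J) + A [k]; in non-blocking mode the
+        // completion of C can be postponed:
+        GxB_Matrix_subassign (C, NULL, GrB_PLUS_FP64, A [k],
+            I [k], a, J [k], a, NULL) ;
+    }
+    GrB_Matrix_wait (&C) ;      // all pending updates are assembled here
+\end{verbatim}}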
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_assign                 // C(I,J) = accum (C(I,J),A)
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for Z=accum(C(I,J),T)
-    const GrB_Matrix A,             // first input: matrix A
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C, Mask, and A
-) ;
-\end{verbatim} } \end{mdframed}
+If blocking mode is enabled, or if the sequence requires the matrix to be
+completed after each assignment, each of the $t$ assignments takes $O(a + s
+\log n)$ time to process the \verb'A' matrix and then $O(n + c + s \log s)$
+time to complete \verb'C'.  The latter step uses \verb'GrB_*_build' to build an
+update matrix and then merge it with \verb'C'.  This step does not occur if the
+sequence of assignments does not add new entries to the pattern of \verb'C',
+however.  Assuming in the worst case that the pattern does change, the total
+time is $O (t \left[ a + s \log n + n + c + s \log s \right] )$.

-\verb'GrB_Matrix_assign' operates on a submatrix \verb'S' of \verb'C',
-modifying it with the matrix \verb'A'.  It may also modify all of \verb'C',
-depending on the input descriptor \verb'desc' and the \verb'Mask'.
+If the sequence can be computed with all updates postponed until the end of the
+sequence, then the total time is no worse than $O(a + s \log n)$ to process
+each \verb'A' matrix, for $t$ assignments, and then a single \verb'build' at
+the end, taking $O(n + c + st \log st)$ time.
+The total time is $O (t \left [a + s \log n \right] + (n + c + st \log st))$.
+If no new entries appear in
+\verb'C' the time drops to $O (t \left [a + s \log n \right])$, and in this
+case, the time for both methods is the same; both are equally efficient.

-\vspace{0.1in}
-\begin{tabular}{lll}
-\hline
-Step & GraphBLAS & description \\
-     & notation  & \\
-\hline
-1 & ${\bf S} = {\bf C(I,J)}$ & extract ${\bf C(I,J)}$ submatrix \\
-2 & ${\bf S} = {\bf S} \odot {\bf A}$ & apply the accumulator (but not the mask) to ${\bf S}$\\
-3 & ${\bf Z} = {\bf C}$ & make a copy of ${\bf C}$ \\
-4 & ${\bf Z(I,J)} = {\bf S}$ & put the submatrix into ${\bf Z(I,J)}$ \\
-5 & ${\bf C \langle M \rangle = Z}$ & apply the mask/replace phase to all of ${\bf C}$ \\
-\hline
-\end{tabular}
-\vspace{0.1in}
+A few simplifying assumptions are useful to compare these times.  Consider a
+graph of $n$ nodes with $O(n)$ edges, and with a constant bound on the degree
+of each node.  The asymptotic bounds assume a worst-case scenario where
+\verb'C' has at least some dense rows (thus the $\log n$ terms).  If these
+are not present, if both $t$ and $c$ are $O(n)$, and if $a$ and $s$ are
+constants, then the total time with blocking mode becomes $O(n^2)$, assuming
+the pattern of \verb'C' changes at each assignment.  This is very high for a
+sparse graph problem.  In contrast, the non-blocking time becomes $O(n \log n)$
+under these same assumptions, which is asymptotically much faster.

-In contrast to \verb'GxB_subassign', the \verb'Mask' has the same size as \verb'C'.
+\newpage
+The difference in practice can be very dramatic, since $n$ can be many millions
+for sparse graphs with $n$ nodes and $O(n)$ edges, which can be handled on a
+commodity laptop.
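+
+To make the comparison concrete, substituting $t=O(n)$, $c=O(n)$, and constant
+$a$ and $s$ into the two bounds above gives
+\[
+O \left( t \left[ a + s \log n + n + c + s \log s \right] \right)
+= O \left( n \cdot n \right) = O(n^2)
+\]
+for blocking mode, versus
+\[
+O \left( t \left[ a + s \log n \right] + n + c + st \log st \right)
+= O(n \log n)
+\]
+for non-blocking mode.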
-Step 1 extracts the submatrix and then Step 2 applies the accumulator -(or ${\bf S}={\bf A}$ if \verb'accum' is \verb'NULL'). The \verb'Mask' is -not yet applied. +The following guidelines should be considered when using +\verb'GxB_subassign', \verb'GrB_assign' and \verb'GrB_*_setElement'. -Step 3 makes a copy of the ${\bf C}$ matrix, and then Step 4 writes the -submatrix ${\bf S}$ into ${\bf Z}$. This is the same as Step 3 of -\verb'GxB_subassign', except that it operates on a temporary matrix ${\bf Z}$. +\begin{enumerate} -Finally, Step 5 writes ${\bf Z}$ back into ${\bf C}$ via the \verb'Mask', using -the Mask/Replace Phase described in Section~\ref{accummask}. If -\verb'GrB_REPLACE' is enabled, then all of ${\bf C}$ is cleared prior to -writing ${\bf Z}$ via the mask. As a result, the \verb'GrB_REPLACE' option can -delete entries outside the ${\bf C(I,J)}$ submatrix. +\item A sequence of assignments that does not modify the pattern at all is +fast, taking as little as $\Omega(1)$ time per entry modified. The worst case +time complexity is $O(\log n)$ per entry, assuming they all modify a dense +row of \verb'C' with \verb'n' entries, which can occur in practice. It is +more common, however, that most rows of \verb'C' have a constant number of +entries, independent of \verb'n'. No work is ever left pending when the +pattern of \verb'C' does not change. -\paragraph{\bf Performance considerations:} % C(I,J) = A -If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if -the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is -fastest if the format of \verb'C' is \verb'GxB_BY_COL'. The opposite is true -if \verb'A' is transposed. +\item A sequence of assignments that modifies the entries that already exist in +the pattern of a matrix, or adds new entries to the pattern (using the same +\verb'accum' operator), but does not delete any entries, is fast. The matrix +is not completed until the end of the sequence. -\newpage -%------------------------------------------------------------------------------- -\subsubsection{{\sf GrB\_Col\_assign:} assign to a sub-column of a matrix} -%------------------------------------------------------------------------------- -\label{assign_column} +\item Similarly, a sequence that modifies existing entries, or deletes them, +but does not add new ones, is also fast. This sequence can also repeatedly +delete pre-existing entries and then reinstate them and still be fast. The +matrix is not completed until the end of the sequence. -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GrB_assign // C(I,j) = accum (C(I,j),u) -( - GrB_Matrix C, // input/output matrix for results - const GrB_Vector mask, // optional mask for C(:,j), unused if NULL - const GrB_BinaryOp accum, // optional accum for z=accum(C(I,j),t) - const GrB_Vector u, // input vector - const GrB_Index *I, // row indices - const GrB_Index ni, // number of row indices - const GrB_Index j, // column index - const GrB_Descriptor desc // descriptor for C(:,j) and mask -) ; -\end{verbatim} } \end{mdframed} +\item A sequence that mixes assignments of types (2) and (3) above can be +costly, since the matrix may need to be completed after each assignment. The +time complexity can become quadratic in the worst case. -\verb'GrB_Col_assign' modifies a single sub-column of a matrix \verb'C'. 
It is
-the same as \verb'GrB_Matrix_assign' where the index vector \verb'J[0]=j' is a
-single column index, and where all matrices in \verb'GrB_Matrix_assign' (except
-\verb'C') consist of a single column.
+\item However, any single assignment takes no more than $O (a + s \log n + n +
+c + s \log s )$ time, even including the time for a matrix completion, where
+\verb'C' is $n$-by-$n$ with $c$ entries and \verb'A' is $a$-by-$a$ with $s$
+entries.  This time is essentially linear in the size of the matrix \verb'C',
+if \verb'A' is relatively small and sparse compared with \verb'C'.  In this
+case, $n+c$ are the two dominant terms.

-Unlike \verb'GrB_Matrix_assign', the \verb'mask' is a vector with the same size
-as a single column of \verb'C'.
+\item In general, \verb'GxB_subassign' is faster than \verb'GrB_assign'.
+If \verb'GrB_REPLACE' is used with \verb'GrB_assign', the entire matrix
+\verb'C' must be traversed.  This is much slower than \verb'GxB_subassign',
+which only needs to examine the \verb'C(I,J)' submatrix.  Furthermore,
+\verb'GrB_assign' must deal with a much larger \verb'Mask' matrix, whereas
+\verb'GxB_subassign' has a smaller mask.  Since its mask is smaller,
+\verb'GxB_subassign' takes less time than \verb'GrB_assign' to access the mask.

-The input descriptor \verb'GrB_INP0' is ignored; the input vector \verb'u' is
-not transposed.  Refer to \verb'GrB_Matrix_assign' for further details.
+\end{enumerate}

-\paragraph{\bf Performance considerations:} % C(I,j) = u
-\verb'GrB_Col_assign' is much faster than \verb'GrB_Row_assign' if the format
-of \verb'C' is \verb'GxB_BY_COL'.  \verb'GrB_Row_assign' is much faster than
-\verb'GrB_Col_assign' if the format of \verb'C' is \verb'GxB_BY_ROW'.
+% see GraphBLAS/Test/test46.m
+
+Submatrix assignment in SuiteSparse:GraphBLAS is extremely efficient, even
+without considering the advantages of non-blocking mode discussed in
+Section~\ref{compare_assign}.  It can be up to 1000x faster than MATLAB R2019b,
+or even higher depending on the kind of matrix assignment.  Logical
+indexing (the mask of GraphBLAS) is dramatically faster in GraphBLAS than
+in MATLAB R2019b; differences of up to 250,000x have been observed (0.4 seconds
+in GraphBLAS versus 28 hours in MATLAB).
+
+All of the 28 variants (each with their own source code) are either
+asymptotically optimal, or to within a log factor of being asymptotically
+optimal.  The methods are also fully parallel.  For hypersparse matrices, the
+term $n$ in the expressions in the above discussion is dropped, and is replaced
+with $h \log h$, at the worst case, where $h \ll n$ is the number of non-empty
+columns of a hypersparse matrix stored by column, or the number of non-empty
+rows of a hypersparse matrix stored by row.  In many methods, $n$ is replaced
+with $h$, not $h \log h$.
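+
+As a concrete illustration of guidelines (2) and (4) above, the following
+hypothetical sketch (with assumed arrays \verb'x', \verb'i', \verb'j',
+\verb'i2', and \verb'j2') contrasts a fast sequence, where the pattern only
+grows, with a potentially costly one, where insertions and deletions are
+interleaved:
+
+{\footnotesize
+\begin{verbatim}
+    // fast: the pattern only grows; a single completion at the end
+    for (int k = 0 ; k < t ; k++)
+        GrB_Matrix_setElement_FP64 (C, x [k], i [k], j [k]) ;
+    GrB_Matrix_wait (&C) ;
+
+    // may be costly: interleaved deletions can force the pending
+    // insertions to be assembled at each step
+    for (int k = 0 ; k < t ; k++)
+    {
+        GrB_Matrix_setElement_FP64 (C, x [k], i [k], j [k]) ;
+        GrB_Matrix_removeElement (C, i2 [k], j2 [k]) ;
+    }
+\end{verbatim}}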
\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Row\_assign:} assign to a sub-row of a matrix}
-%-------------------------------------------------------------------------------
-\label{assign_row}
+%===============================================================================
+\subsection{{\sf GrB\_apply:} apply a unary, binary, or index-unary operator}
+%===============================================================================
+\label{apply}

-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_assign                 // C(i,J) = accum (C(i,J),u')
-(
-    GrB_Matrix C,                   // input/output matrix for results
-    const GrB_Vector mask,          // optional mask for C(i,:), unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(C(i,J),t)
-    const GrB_Vector u,             // input vector
-    const GrB_Index i,              // row index
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C(i,:) and mask
-) ;
-\end{verbatim} } \end{mdframed}
+\verb'GrB_apply' is the generic name for 92 specific functions:

-\verb'GrB_Row_assign' modifies a single sub-row of a matrix \verb'C'. It is
-the same as \verb'GrB_Matrix_assign' where the index vector \verb'I[0]=i' is
-a single row index, and where all matrices in \verb'GrB_Matrix_assign'
-(except \verb'C') consist of a single row.
+\begin{packed_itemize}
+\item
+\verb'GrB_Vector_apply' and \verb'GrB_Matrix_apply' apply a unary operator to
+the entries of a matrix (two variants).
+
+\item \verb'GrB_*_apply_BinaryOp1st_*' applies a binary
+operator where a single scalar is provided as the $x$ input to the binary
+operator.
+There are 30 variants, depending on the type of the scalar: (matrix or vector)
+x (13 built-in types, one for user-defined types, and a version for
+\verb'GrB_Scalar').
+
+\item \verb'GrB_*_apply_BinaryOp2nd_*' applies a binary operator where a
+single scalar is provided as the $y$ input to the binary operator.
+There are 30 variants, depending on the type of the scalar: (matrix or vector)
+x (13 built-in types, one for user-defined types, and a version for
+\verb'GrB_Scalar').

-Unlike \verb'GrB_Matrix_assign', the \verb'mask' is a vector with the same size
-as a single row of \verb'C'.
+\item \verb'GrB_*_apply_IndexOp_*' applies a \verb'GrB_IndexUnaryOp', where a
+single scalar is provided as the scalar $y$ input to the index-unary operator.
+There are 30 variants, depending on the type of the scalar: (matrix or vector)
+x (13 built-in types, one for user-defined types, and a version for
+\verb'GrB_Scalar').

-The input descriptor \verb'GrB_INP0' is ignored; the input vector \verb'u' is
-not transposed. Refer to \verb'GrB_Matrix_assign' for further details.
+\end{packed_itemize}

-\paragraph{\bf Performance considerations:} % C(i,J) = u'
-\verb'GrB_Col_assign' is much faster than \verb'GrB_Row_assign' if the format
-of \verb'C' is \verb'GxB_BY_COL'. \verb'GrB_Row_assign' is much faster than
-\verb'GrB_Col_assign' if the format of \verb'C' is \verb'GxB_BY_ROW'.
+The generic
+name appears in the function prototypes, but the specific function name is used
+when describing each variation. When discussing features that apply to all
+versions, the simple name \verb'GrB_apply' is used.
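+
+For example, the two calls below are equivalent (a sketch, not part of the
+list above; \verb'w' and \verb'u' are assumed to be previously created
+\verb'GrB_FP64' vectors). The first uses the generic name, which is resolved
+via \verb'_Generic', and the second calls the specific function directly:
+
+{\small
+\begin{verbatim}
+    // generic name: computes w(i) = u(i) - 2 for all entries of u,
+    // binding the scalar y = 2 to the second input of the operator
+    GrB_apply (w, NULL, NULL, GrB_MINUS_FP64, u, (double) 2, NULL) ;
+
+    // specific name: exactly the same computation
+    GrB_Vector_apply_BinaryOp2nd_FP64 (w, NULL, NULL, GrB_MINUS_FP64,
+        u, (double) 2, NULL) ;
+\end{verbatim}}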
\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_assign\_$<$type$>$:} assign a scalar to a subvector}
+\subsubsection{{\sf GrB\_Vector\_apply:} apply a unary operator to a vector}
%-------------------------------------------------------------------------------
-\label{assign_vector_scalar}
+\label{apply_vector}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_assign                 // w(I) = accum (w(I),x)
+GrB_Info GrB_apply                  // w = accum (w, op(u))
(
    GrB_Vector w,                   // input/output vector for results
    const GrB_Vector mask,          // optional mask for w, unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for z=accum(w(I),x)
-    const <type> x,                 // scalar to assign to w(I)
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_UnaryOp op,           // operator to apply to the entries
+    const GrB_Vector u,             // first input: vector u
    const GrB_Descriptor desc       // descriptor for w and mask
) ;
\end{verbatim} } \end{mdframed}

-\verb'GrB_Vector_assign_<type>' assigns a single scalar to an entire subvector
-of the vector \verb'w'. The operation is exactly like setting a single entry
-in an \verb'n'-by-1 matrix, \verb'A(I,0) = x', where the column index for a
-vector is implicitly \verb'j=0'. The \verb'mask' vector has the same size as
-\verb'w'. For further details of this function, see
-\verb'GrB_Matrix_assign_<type>' in the next section
-(\ref{assign_matrix_scalar}).
-
-Following the C API Specification, results are well-defined if \verb'I'
-contains duplicate indices. Duplicate indices are simply ignored. See
-Section~\ref{duplicates} for more details.
+\verb'GrB_Vector_apply' applies a unary operator to the entries of a vector,
+analogous to \verb't = op(u)' in MATLAB except the operator \verb'op' is only
+applied to entries in the pattern of \verb'u'. Implicit values outside the
+pattern of \verb'u' are not affected. The entries in \verb'u' are typecasted
+into the \verb'xtype' of the unary operator. The vector \verb't' has the same
+type as the \verb'ztype' of the unary operator. The final step is ${\bf w
+\langle m \rangle = w \odot t}$, as described in Section~\ref{accummask},
+except that all the terms are column vectors instead of matrices.
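+
+For example, the following sketch (with \verb'u' assumed to be a previously
+created \verb'GrB_FP64' vector) negates every entry of \verb'u', using the
+built-in unary operator \verb'GrB_AINV_FP64' and no mask or accumulator:
+
+{\small
+\begin{verbatim}
+    GrB_Vector w ;
+    GrB_Index n ;
+    GrB_Vector_size (&n, u) ;
+    GrB_Vector_new (&w, GrB_FP64, n) ;
+    // w = -u, applied only to entries in the pattern of u
+    GrB_Vector_apply (w, NULL, NULL, GrB_AINV_FP64, u, NULL) ;
+\end{verbatim}}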
-% \newpage
+\newpage
%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_assign\_$<$type$>$:} assign a scalar to a submatrix}
+\subsubsection{{\sf GrB\_Matrix\_apply:} apply a unary operator to a matrix}
%-------------------------------------------------------------------------------
-\label{assign_matrix_scalar}
+\label{apply_matrix}

\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_assign                 // C(I,J) = accum (C(I,J),x)
+GrB_Info GrB_apply                  // C = accum (C, op(A)) or op(A')
(
    GrB_Matrix C,                   // input/output matrix for results
    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
-    const GrB_BinaryOp accum,       // optional accum for Z=accum(C(I,J),x)
-    const <type> x,                 // scalar to assign to C(I,J)
-    const GrB_Index *I,             // row indices
-    const GrB_Index ni,             // number of row indices
-    const GrB_Index *J,             // column indices
-    const GrB_Index nj,             // number of column indices
-    const GrB_Descriptor desc       // descriptor for C and Mask
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const GrB_UnaryOp op,           // operator to apply to the entries
+    const GrB_Matrix A,             // first input: matrix A
+    const GrB_Descriptor desc       // descriptor for C, mask, and A
) ;
\end{verbatim} } \end{mdframed}

-\verb'GrB_Matrix_assign_<type>' assigns a single scalar to an entire
-submatrix of \verb'C', like the {\em scalar expansion} \verb'C(I,J)=x' in
-MATLAB. The scalar \verb'x' is implicitly expanded into a matrix \verb'A' of
-size \verb'ni' by \verb'nj', and then the matrix \verb'A' is assigned to
-\verb'C(I,J)' using the same method as in \verb'GrB_Matrix_assign'. Refer
-to that function in Section~\ref{assign_matrix} for further details.
-
-The \verb'Mask' has the same size as \verb'C'.
-
-For the accumulation step, the scalar \verb'x' is typecasted directly into the
-type of \verb'C' when the \verb'accum' operator is not applied to it, or into
-the \verb'ytype' of the \verb'accum' operator, if \verb'accum' is not NULL, for
-entries that are already present in \verb'C'.
-
-The \verb'<type> x' notation is otherwise the same as
-\verb'GrB_Matrix_setElement' (see Section~\ref{matrix_setElement}). Any value
-can be passed to this function and its type will be detected, via the
-\verb'_Generic' feature of ANSI C11. For a user-defined type, \verb'x' is a
-\verb'void *' pointer that points to a memory space holding a single entry of a
-scalar that has exactly the same user-defined type as the matrix \verb'C'.
-This user-defined type must exactly match the user-defined type of \verb'C'
-since no typecasting is done between user-defined types.
-
-If a \verb'void *' pointer is passed in and the type of the underlying scalar
-does not exactly match the user-defined type of \verb'C', then results are
-undefined. No error status will be returned since GraphBLAS has no way of
-catching this error.
-
-If \verb'x' is a \verb'GrB_Scalar' with no entry, then it is implicitly
-expanded into a matrix \verb'A' of size \verb'ni' by \verb'nj', with no entries
-present.
-
-Following the C API Specification, results are well-defined if \verb'I' or
-\verb'J' contain duplicate indices. Duplicate indices are simply ignored. See
-Section~\ref{duplicates} for more details.
-
-\paragraph{\bf Performance considerations:} % C(I,J) = scalar
-If \verb'A' is not transposed: if \verb'|I|' is small, then it is fastest if
-the format of \verb'C' is \verb'GxB_BY_ROW'; if \verb'|J|' is small, then it is
-fastest if the format of \verb'C' is \verb'GxB_BY_COL'.
The opposite is true -if \verb'A' is transposed. - -\newpage -%=============================================================================== -\subsection{Duplicate indices in {\sf GrB\_assign} and {\sf GxB\_subassign}} -%=============================================================================== -\label{duplicates} - -According to the GraphBLAS C API Specification if the index vectors \verb'I' or -\verb'J' contain duplicate indices, the results are undefined for -\verb'GrB_Matrix_assign', \verb'GrB_Matrix_assign', \verb'GrB_Col_assign', and -\verb'GrB_Row_assign'. Only the scalar assignment operations -(\verb'GrB_Matrix_assign_TYPE' and \verb'GrB_Matrix_assign_TYPE') are -well-defined when duplicates appear in \verb'I' and \verb'J'. In those two -functions, duplicate indices are ignored. - -As an extension to the specification, SuiteSparse:GraphBLAS provides a -definition of how duplicate indices are handled in all cases. If \verb'I' has -duplicate indices, they are ignored and the last unique entry in the list is -used. When no mask and no accumulator is present, the results are identical to -how MATLAB handles duplicate indices in the built-in expression -\verb'C(I,J)=A'. Details of how this is done is shown below. - -{\small -\begin{verbatim} - function C = subassign (C, I, J, A) - % submatrix assignment with pre-sort of I and J; and remove duplicates - - % delete duplicates from I, keeping the last one seen - [I2 I2k] = sort (I) ; - Idupl = [(I2 (1:end-1) == I2 (2:end)), false] ; - I2 = I2 (~Idupl) ; - I2k = I2k (~Idupl) ; - assert (isequal (I2, unique (I))) - - % delete duplicates from J, keeping the last one seen - [J2 J2k] = sort (J) ; - Jdupl = [(J2 (1:end-1) == J2 (2:end)), false] ; - J2 = J2 (~Jdupl) ; - J2k = J2k (~Jdupl) ; - assert (isequal (J2, unique (J))) - - % do the submatrix assignment, with no duplicates in I2 or J2 - C (I2,J2) = A (I2k,J2k) ; -\end{verbatim}} - -If a mask is present, then it is replaced with \verb'M = M (I2k, J2k)' for -\verb'GxB_subassign', or with \verb'M = M (I2, J2)' for \verb'GrB_assign'. -If an accumulator operator is present, it is applied after the duplicates -are removed, as (for example): - -{\small -\begin{verbatim} - C (I2,J2) = C (I2,J2) + A (I2k,J2k) ; -\end{verbatim}} - -These definitions allow the Octave/MATLAB interface to GraphBLAS to return the same -results for \verb'C(I,J)=A' for a \verb'GrB' object as they do for built-in -Octave/MATLAB matrices. They also allow the assignment to be done in parallel. - -Results are always well-defined in SuiteSparse:GraphBLAS, but they might not be -what you expect. For example, suppose the \verb'MIN' operator is being used -the following assigment to the vector \verb'x', and suppose \verb'I' contains -the entries \verb'[0 0]'. Suppose \verb'x' is initially empty, of length 1, -and suppose \verb'y' is a vector of length 2 with the values \verb'[5 7]'. 
-
-{\small
-\begin{verbatim}
-    #include "GraphBLAS.h"
-    #include <stdio.h>
-    int main (void)
-    {
-        GrB_init (GrB_NONBLOCKING) ;
-        GrB_Vector x, y ;
-        GrB_Vector_new (&x, GrB_INT32, 1) ;
-        GrB_Vector_new (&y, GrB_INT32, 2) ;
-        GrB_Index I [2] = {0, 0} ;
-        GrB_Vector_setElement (y, 5, 0) ;
-        GrB_Vector_setElement (y, 7, 1) ;
-        GrB_Vector_wait (&y) ;
-        GxB_print (x, 3) ;
-        GxB_print (y, 3) ;
-        GrB_assign (x, NULL, GrB_MIN_INT32, y, I, 2, NULL) ;
-        GrB_Vector_wait (&y) ;
-        GxB_print (x, 3) ;
-        GrB_finalize ( ) ;
-    }
-\end{verbatim}}
-
-You might (wrongly) expect the result to be the vector \verb'x(0)=5', since
-two entries seem to be assigned, and the min operator might be expected to
-take the minimum of the two. This is not how SuiteSparse:GraphBLAS handles
-duplicates.
-
-Instead, the first duplicate index of \verb'I' is discarded
-(\verb'I [0] = 0', and \verb'y(0)=5').
-and only the second entry is used
-(\verb'I [1] = 0', and \verb'y(1)=7').
-The output of the above program is:
-
-{\small
-\begin{verbatim}
-
-    1x1 GraphBLAS int32_t vector, sparse by col:
-    x, no entries
-
-
-    2x1 GraphBLAS int32_t vector, sparse by col:
-    y, 2 entries
-
-        (0,0)    5
-        (1,0)    7
-
-
-    1x1 GraphBLAS int32_t vector, sparse by col:
-    x, 1 entry
-
-        (0,0)    7
-
-\end{verbatim}}
-
-You see that the result is \verb'x(0)=7', since the \verb'y(0)=5' entry
-has been ignored because of the duplicate indices in \verb'I'.
+\verb'GrB_Matrix_apply'
+applies a unary operator to the entries of a matrix, analogous to
+\verb'T = op(A)' in MATLAB except the operator \verb'op' is only applied to
+entries in the pattern of \verb'A'. Implicit values outside the pattern of
+\verb'A' are not affected. The input matrix \verb'A' may be transposed first.
+The entries in \verb'A' are typecasted into the \verb'xtype' of the unary
+operator. The matrix \verb'T' has the same type as the \verb'ztype' of the
+unary operator. The final step is ${\bf C \langle M \rangle = C \odot T}$, as
+described in Section~\ref{accummask}.

-\begin{alert}
-{\bf SPEC:} Providing a well-defined behavior for duplicate
-indices with matrix and vector assignment is an extension to the specification.
-The specification only defines the behavior when assigning a scalar into a matrix
-or vector, and states that duplicate indices otherwise lead to undefined
-results.
-\end{alert}
+The built-in \verb'GrB_IDENTITY_'$T$ operators (one for each built-in type $T$)
+are very useful when combined with this function, enabling it to compute ${\bf
+C \langle M \rangle = C \odot A}$. This makes \verb'GrB_apply' a direct
+interface to the accumulator/mask function for both matrices and vectors.
+The \verb'GrB_IDENTITY_'$T$ operators also provide the fastest stand-alone
+typecasting methods in SuiteSparse:GraphBLAS, with all $13 \times 13=169$
+methods appearing as individual functions, to typecast between any of the 13
+built-in types.
+
+To compute ${\bf C \langle M \rangle = A}$ or ${\bf C \langle M \rangle = C
+\odot A}$ for user-defined types, the user application would need to define an
+identity operator for the type. Since GraphBLAS cannot detect that it is an
+identity operator, it must call the operator to make the full copy \verb'T=A'
+and apply the operator to each entry of the matrix or vector.
+
+The other GraphBLAS operation that provides a direct interface to the
+accumulator/mask function is \verb'GrB_transpose', which does not require an
+operator to perform this task.
As a result, \verb'GrB_transpose' can be used
+as an efficient and direct interface to the accumulator/mask function for
+both built-in and user-defined types. However, it is only available for
+matrices, not vectors.

\newpage
%===============================================================================
-\subsection{Comparing {\sf GrB\_assign} and {\sf GxB\_subassign}} %=============
+\subsubsection{{\sf GrB\_Vector\_apply\_BinaryOp1st:} apply a binary operator to a vector; 1st scalar binding}
%===============================================================================
-\label{compare_assign}
+\label{vector_apply1st}

-The \verb'GxB_subassign' and \verb'GrB_assign' operations are very similar, but
-they differ in two ways:
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // w = accum (w, op(x,u))
+(
+    GrB_Vector w,                   // input/output vector for results
+    const GrB_Vector mask,          // optional mask for w, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_BinaryOp op,          // operator to apply to the entries
+    <type> x,                       // first input: scalar x
+    const GrB_Vector u,             // second input: vector u
+    const GrB_Descriptor desc       // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}

-\begin{enumerate}
-\item {\bf The Mask has a different size:}
-    The mask in \verb'GxB_subassign' has the same dimensions as \verb'w(I)' for
-    vectors and \verb'C(I,J)' for matrices. In \verb'GrB_assign', the mask is
-    the same size as \verb'w' or \verb'C', respectively (except for the row/col
-    variants). The two masks are related. If \verb'M' is the mask for
-    \verb'GrB_assign', then \verb'M(I,J)' is the mask for \verb'GxB_subassign'.
-    If there is no mask, or if \verb'I' and \verb'J' are both \verb'GrB_ALL',
-    the two masks are the same.
-    For \verb'GrB_Row_assign' and \verb'GrB_Col_assign', the \verb'mask' vector
-    is the same size as a row or column of \verb'C', respectively. For the
-    corresponding \verb'GxB_Row_subassign' and \verb'GxB_Col_subassign'
-    operations, the \verb'mask' is the same size as the sub-row \verb'C(i,J)' or
-    subcolumn \verb'C(I,j)', respectively.
+\verb'GrB_Vector_apply_BinaryOp1st_<type>' applies a binary operator
+$z=f(x,y)$ to a vector, where a scalar $x$ is bound to the first input of the
+operator.
+The scalar \verb'x' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Vector_apply'.

-\item {\bf \verb'GrB_REPLACE' is different:}
-    They differ in how \verb'C' is affected in areas outside the \verb'C(I,J)'
-    submatrix. In \verb'GxB_subassign', the \verb'C(I,J)' submatrix is the
-    only part of \verb'C' that can be modified, and no part of \verb'C' outside
-    the submatrix is ever modified. In \verb'GrB_assign', it is possible to
-    delete entries in \verb'C' outside the submatrix, but only in one specific
-    manner. Suppose the mask \verb'M' is present (or, suppose it is not
-    present but \verb'GrB_COMP' is true). After (optionally) complementing the
-    mask, the value of \verb'M(i,j)' can be 0 for some entry outside the
-    \verb'C(I,J)' submatrix. If the \verb'GrB_REPLACE' descriptor is
-    true, \verb'GrB_assign' deletes this entry.
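+
+For example, binding the scalar $x=1$ to the first input of the built-in
+operator \verb'GrB_DIV_FP64' computes the reciprocal of every entry (a
+sketch; \verb'w' and \verb'u' are assumed to be previously created
+\verb'GrB_FP64' vectors of the same size):
+
+{\small
+\begin{verbatim}
+    // w(i) = 1 / u(i), for all entries in the pattern of u
+    GrB_Vector_apply_BinaryOp1st_FP64 (w, NULL, NULL, GrB_DIV_FP64,
+        (double) 1, u, NULL) ;
+\end{verbatim}}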
+%===============================================================================
+\subsubsection{{\sf GrB\_Vector\_apply\_BinaryOp2nd:} apply a binary operator to a vector; 2nd scalar binding}
+%===============================================================================
+\label{vector_apply2nd}

-\end{enumerate}
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // w = accum (w, op(u,y))
+(
+    GrB_Vector w,                   // input/output vector for results
+    const GrB_Vector mask,          // optional mask for w, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_BinaryOp op,          // operator to apply to the entries
+    const GrB_Vector u,             // first input: vector u
+    <type> y,                       // second input: scalar y
+    const GrB_Descriptor desc       // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}

-\verb'GxB_subassign' and \verb'GrB_assign' are identical if \verb'GrB_REPLACE'
-is set to its default value of false, and if the masks happen to be the same.
-The two masks can be the same in two cases: either the \verb'Mask' input is
-\verb'NULL' (and it is not complemented via \verb'GrB_COMP'), or \verb'I' and
-\verb'J' are both \verb'GrB_ALL'.
-If all these conditions hold,
-the two algorithms are identical and have the same performance. Otherwise,
-\verb'GxB_subassign' is much faster than \verb'GrB_assign' when the latter
-must examine the entire matrix \verb'C' to delete entries (when
-\verb'GrB_REPLACE' is true), and if it must deal with a much larger \verb'Mask'
-matrix. However, both methods have specific uses.
+\verb'GrB_Vector_apply_BinaryOp2nd_<type>' applies a binary operator
+$z=f(x,y)$ to a vector, where a scalar $y$ is bound to the second input of the
+operator.
+The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Vector_apply'.

-Consider using \verb'C(I,J)+=F' for many submatrices \verb'F' (for example,
-when assembling a finite-element matrix). If the \verb'Mask' is meant as a
-specification for which entries of \verb'C' should appear in the final result,
-then use \verb'GrB_assign'.
+\newpage
+%===============================================================================
+\subsubsection{{\sf GrB\_Vector\_apply\_IndexOp:} apply an index-unary operator to a vector}
+%===============================================================================
+\label{vector_apply_idxunop}

-If instead the \verb'Mask' is meant to control which entries of the submatrix
-\verb'C(I,J)' are modified by the finite-element \verb'F', then use
-\verb'GxB_subassign'. This is particularly useful is the \verb'Mask' is a
-template that follows along with the finite-element \verb'F', independent of
-where it is applied to \verb'C'. Using \verb'GrB_assign' would be very
-difficult in this case since a new \verb'Mask', the same size as \verb'C',
-would need to be constructed for each finite-element \verb'F'.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // w = accum (w, op(u,y))
+(
+    GrB_Vector w,                   // input/output vector for results
+    const GrB_Vector mask,          // optional mask for w, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_IndexUnaryOp op,      // operator to apply to the entries
+    const GrB_Vector u,             // first input: vector u
+    const <type> y,                 // second input: scalar y
+    const GrB_Descriptor desc       // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}

-In GraphBLAS notation, the two methods can be described as follows:
+\verb'GrB_Vector_apply_IndexOp_<type>' applies an index-unary operator
+$z=f(x,i,0,y)$ to a vector.
+The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Vector_apply'.

-\vspace{0.05in}
-\begin{tabular}{ll}
-\hline
-matrix and vector subassign & ${\bf C(I,J) \langle M \rangle} = {\bf C(I,J)} \odot {\bf A}$ \\
-matrix and vector assign    & ${\bf C \langle M \rangle (I,J)} = {\bf C(I,J)} \odot {\bf A}$ \\
-\hline
-\end{tabular}
-\vspace{0.05in}
+%===============================================================================
+\subsubsection{{\sf GrB\_Matrix\_apply\_BinaryOp1st:} apply a binary operator to a matrix; 1st scalar binding}
+%===============================================================================
+\label{matrix_apply1st}

-This notation does not include the details of the \verb'GrB_COMP' and
-\verb'GrB_REPLACE' descriptors, but it does illustrate the difference in the
-\verb'Mask'. In the subassign, \verb'Mask' is the same size as \verb'C(I,J)'
-and \verb'A'. If \verb'I[0]=i' and \verb'J[0]=j', Then \verb'Mask(0,0)'
-controls how \verb'C(i,j)' is modified by the subassign, from the value
-\verb'A(0,0)'. In the assign, \verb'Mask' is the same size as \verb'C', and
-\verb'Mask(i,j)' controls how \verb'C(i,j)' is modified.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // C=accum(C,op(x,A))
+(
+    GrB_Matrix C,                   // input/output matrix for results
+    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const GrB_BinaryOp op,          // operator to apply to the entries
+    <type> x,                       // first input: scalar x
+    const GrB_Matrix A,             // second input: matrix A
+    const GrB_Descriptor desc       // descriptor for C, mask, and A
+) ;
+\end{verbatim} } \end{mdframed}

-The \verb'GxB_subassign' and \verb'GrB_assign' functions have the same
-signatures; they differ only in how they consider the \verb'Mask' and the
-\verb'GrB_REPLACE' descriptor
+\verb'GrB_Matrix_apply_BinaryOp1st_<type>' applies a binary operator
+$z=f(x,y)$ to a matrix, where a scalar $x$ is bound to the first input of the
+operator.
+The scalar \verb'x' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Matrix_apply'.
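+
+For example, the following sketch (with \verb'C' and \verb'A' assumed to be
+previously created \verb'GrB_FP64' matrices of the same size) scales every
+entry of \verb'A' by 2, binding the scalar to the first operator input:
+
+{\small
+\begin{verbatim}
+    // C = 2*A, applied only to entries in the pattern of A
+    GrB_Matrix_apply_BinaryOp1st_FP64 (C, NULL, NULL, GrB_TIMES_FP64,
+        (double) 2, A, NULL) ;
+\end{verbatim}}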
-Details of each step of the two operations are listed below:
+\newpage
+%===============================================================================
+\subsubsection{{\sf GrB\_Matrix\_apply\_BinaryOp2nd:} apply a binary operator to a matrix; 2nd scalar binding}
+%===============================================================================
+\label{matrix_apply2nd}

-\vspace{0.1in}
-\begin{tabular}{lll}
-\hline
-Step & \verb'GrB_Matrix_assign' & \verb'GxB_Matrix_subassign' \\
-\hline
-1 & ${\bf S} = {\bf C(I,J)}$ & ${\bf S} = {\bf C(I,J)}$ \\
-2 & ${\bf S} = {\bf S} \odot {\bf A}$ & ${\bf S \langle M \rangle} = {\bf S} \odot {\bf A}$ \\
-3 & ${\bf Z} = {\bf C}$ & ${\bf C(I,J)}= {\bf S}$ \\
-4 & ${\bf Z(I,J)} = {\bf S}$ & \\
-5 & ${\bf C \langle M \rangle = Z}$ & \\
-\hline
-\end{tabular}
-\vspace{0.1in}
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // C=accum(C,op(A,y))
+(
+    GrB_Matrix C,                   // input/output matrix for results
+    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const GrB_BinaryOp op,          // operator to apply to the entries
+    const GrB_Matrix A,             // first input: matrix A
+    <type> y,                       // second input: scalar y
+    const GrB_Descriptor desc       // descriptor for C, mask, and A
+) ;
+\end{verbatim} } \end{mdframed}

-Step 1 is the same. In the Accumulator Phase (Step 2), the expression
-${\bf S} \odot {\bf A}$,
-described in Section~\ref{accummask}, is the same in both
-operations. The result is simply ${\bf A}$ if \verb'accum' is \verb'NULL'. It
-only applies to the submatrix ${\bf S}$, not the whole matrix.
-The result ${\bf S} \odot {\bf A}$ is used differently in the Mask/Replace
-phase.
+\verb'GrB_Matrix_apply_BinaryOp2nd_<type>' applies a binary operator
+$z=f(x,y)$ to a matrix, where a scalar $y$ is bound to the second input of the
+operator.
+The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Matrix_apply'.

-The Mask/Replace Phase, described in Section~\ref{accummask} is different:
-\begin{itemize}
-\item
-    For \verb'GrB_assign' (Step 5), the mask is applied to all of ${\bf
-    C}$. The mask has the same size as ${\bf C}$. Just prior to making the
-    assignment via the mask, the \verb'GrB_REPLACE' option can be used to clear
-    all of ${\bf C}$ first. This is the only way in which entries in ${\bf C}$ that
-    are outside the ${\bf C(I,J)}$ submatrix can be modified by this operation.
+%===============================================================================
+\subsubsection{{\sf GrB\_Matrix\_apply\_IndexOp:} apply an index-unary operator to a matrix}
+%===============================================================================
+\label{matrix_apply_idxunop}

-\item
-    For \verb'GxB_subassign' (Step 2b), the mask is applied to just
-    ${\bf S}$. The mask has the same size as ${\bf C(I,J)}$, ${\bf S}$, and
-    ${\bf A}$. Just prior to making the assignment via the mask, the
-    \verb'GrB_REPLACE' option can be used to clear ${\bf S}$ first. No entries
-    in ${\bf C}$ that are outside the ${\bf C(I,J)}$ can be modified by this
-    operation. Thus, \verb'GrB_REPLACE' has no effect on entries in ${\bf C}$
-    outside the ${\bf C(I,J)}$ submatrix.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_apply                  // C=accum(C,op(A,y))
+(
+    GrB_Matrix C,                   // input/output matrix for results
+    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const GrB_IndexUnaryOp op,      // operator to apply to the entries
+    const GrB_Matrix A,             // first input: matrix A
+    const <type> y,                 // second input: scalar y
+    const GrB_Descriptor desc       // descriptor for C, mask, and A
+) ;
+\end{verbatim} } \end{mdframed}

-\end{itemize}
+\verb'GrB_Matrix_apply_IndexOp_<type>' applies an index-unary operator
+$z=f(x,i,j,y)$ to a matrix.
+The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
+type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
+It is otherwise identical to \verb'GrB_Matrix_apply'.

-The differences between \verb'GrB_assign' and
-\verb'GxB_subassign' can be seen in Tables~\ref{insubmatrix} and
-\ref{outsubmatrix}. The first table considers the case when the entry $c_{ij}$
-is in the ${\bf C(I,J)}$ submatrix, and it describes what is computed for both
-\verb'GrB_assign' and \verb'GxB_subassign'. They perform the
-exact same computation; the only difference is how the value of the mask is
-specified. Compare Table~\ref{insubmatrix} with Table~\ref{tab:maskaccum}
-in Section~\ref{sec:maskaccum}.
+\newpage
+%===============================================================================
+\subsection{{\sf GrB\_select:} select entries based on an index-unary operator}
+%===============================================================================
+\label{select}

-The first column of Table~\ref{insubmatrix} is {\em yes} if \verb'GrB_REPLACE' is enabled,
-and a dash otherwise. The second column is {\em yes} if an accumulator
-operator is given, and a dash otherwise. The third column is $c_{ij}$ if the
-entry is present in ${\bf C}$, and a dash otherwise. The fourth column is
-$a_{i'j'}$ if the corresponding entry is present in ${\bf A}$, where
-$i={\bf I}(i')$ and $j={\bf J}(i')$.
+The \verb'GrB_select' function is the generic name for 30 specific functions,
+depending on whether it operates on a matrix or vector, and depending on the
+type of the scalar \verb'y': (matrix or vector) x (13 built-in types,
+\verb'void *' for user-defined types, and a \verb'GrB_Scalar'). The generic
+name appears in the function prototypes, but the specific function name is used
+when describing each variation. When discussing features that apply to both
+versions, the simple name \verb'GrB_select' is used.

-The {\em mask} column is 1 if the effective value of the mask mask allows ${\bf
-C}$ to be modified, and 0 otherwise. This is $m_{ij}$ for \verb'GrB_assign',
-and $m_{i'j'}$ for \verb'GxB_subassign', to reflect the difference in the mask,
-but this difference is not reflected in the table. The value 1 or 0 is the
-value of the entry in the mask after it is optionally complemented via the
-\verb'GrB_COMP' option.
+% \newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Vector\_select:} select entries from a vector}
+%-------------------------------------------------------------------------------
+\label{select_vector}

-Finally, the last column is the action taken in this case. It is left blank if
-no action is taken, in which case $c_{ij}$ is not modified if present, or not
-inserted into ${\bf C}$ if not present.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_select                 // w = accum (w, op(u))
+(
+    GrB_Vector w,                   // input/output vector for results
+    const GrB_Vector mask,          // optional mask for w, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_IndexUnaryOp op,      // operator to apply to the entries
+    const GrB_Vector u,             // first input: vector u
+    const <type> y,                 // second input: scalar y
+    const GrB_Descriptor desc       // descriptor for w and mask
+) ;
+\end{verbatim} } \end{mdframed}

-\begin{table}
-{\small
-\begin{tabular}{lllll|l}
-\hline
-repl & accum & ${\bf C}$ & ${\bf A}$ & mask & action taken by \verb'GrB_assign' and \verb'GxB_subassign'\\
-\hline
-    - &-     & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, update \\
-    - &-     & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
-    - &-     & $c_{ij}$  & -           & 1 & delete $c_{ij}$ because $a_{i'j'}$ not present \\
-    - &-     & -         & -           & 1 & \\
-    - &-     & $c_{ij}$  & $a_{i'j'}$  & 0 & \\
-    - &-     & -         & $a_{i'j'}$  & 0 & \\
-    - &-     & $c_{ij}$  & -           & 0 & \\
-    - &-     & -         & -           & 0 & \\
-\hline
-    yes&-    & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, update \\
-    yes&-    & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
-    yes&-    & $c_{ij}$  & -           & 1 & delete $c_{ij}$ because $a_{i'j'}$ not present \\
-    yes&-    & -         & -           & 1 & \\
-    yes&-    & $c_{ij}$  & $a_{i'j'}$  & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes&-    & -         & $a_{i'j'}$  & 0 & \\
-    yes&-    & $c_{ij}$  & -           & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes&-    & -         & -           & 0 & \\
-\hline
-    - &yes   & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = c_{ij} \odot a_{i'j'}$, apply accumulator \\
-    - &yes   & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
-    - &yes   & $c_{ij}$  & -           & 1 & \\
-    - &yes   & -         & -           & 1 & \\
-    - &yes   & $c_{ij}$  & $a_{i'j'}$  & 0 & \\
-    - &yes   & -         & $a_{i'j'}$  & 0 & \\
-    - &yes   & $c_{ij}$  & -           & 0 & \\
-    - &yes   & -         & -           & 0 & \\
-\hline
-    yes&yes  & $c_{ij}$  & $a_{i'j'}$  & 1 & $c_{ij} = c_{ij} \odot a_{i'j'}$, apply accumulator \\
-    yes&yes  & -         & $a_{i'j'}$  & 1 & $c_{ij} = a_{i'j'}$, insert \\
-    yes&yes  & $c_{ij}$  & -           & 1 & \\
-    yes&yes  & -         & -           & 1 & \\
-    yes&yes  & $c_{ij}$  & $a_{i'j'}$  & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes&yes  & -         & $a_{i'j'}$  & 0 & \\
-    yes&yes  & $c_{ij}$  & -           & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes&yes  & -         & -           & 0 & \\
-\hline
-\end{tabular}
-}
-\caption{Results of assign and subassign for entries in the ${\bf C(I,J)}$ submatrix \label{insubmatrix}}
-\end{table}
+\verb'GrB_Vector_select_*' applies a \verb'GrB_IndexUnaryOp' operator to the
+entries of a vector. If the operator evaluates as \verb'true' for the entry
+\verb'u(i)', it is copied to the vector \verb't', or not copied if the operator
+evaluates to \verb'false'. The vector \verb't' is then written to the result
+\verb'w' via the mask/accumulator step. This operation operates on vectors
+just as if they were \verb'm'-by-1 matrices, except that GraphBLAS never
+transposes a vector via the descriptor. Refer to the next section
+(\ref{select_matrix}) on \verb'GrB_Matrix_select' for more details.

\newpage
-Table~\ref{outsubmatrix} illustrates how \verb'GrB_assign' and
-\verb'GxB_subassign' differ for entries outside the submatrix.
-\verb'GxB_subassign' never modifies any entry outside the ${\bf C(I,J)}$
-submatrix, but \verb'GrB_assign' can modify them in two cases listed in
-Table~\ref{outsubmatrix}.
When the \verb'GrB_REPLACE' option is selected, and -when the \verb'Mask(i,j)' for an entry $c_{ij}$ is false (or if the -\verb'Mask(i,j)' is true and \verb'GrB_COMP' is enabled via the descriptor), -then the entry is deleted by \verb'GrB_assign'. +%------------------------------------------------------------------------------- +\subsubsection{{\sf GrB\_Matrix\_select:} apply a select operator to a matrix} +%------------------------------------------------------------------------------- +\label{select_matrix} -The fourth column of Table~\ref{outsubmatrix} differs from -Table~\ref{insubmatrix}, since entries in ${\bf A}$ never affect these entries. -Instead, for all index pairs outside the $I \times J$ submatrix, ${\bf C}$ and -${\bf Z}$ are identical (see Step 3 above). As a result, each section of the -table includes just two cases: either $c_{ij}$ is present, or not. This in -contrast to Table~\ref{insubmatrix}, where each section must consider four -different cases. +\begin{mdframed}[userdefinedwidth=6in] +{\footnotesize +\begin{verbatim} +GrB_Info GrB_select // C=accum(C,op(A)) +( + GrB_Matrix C, // input/output matrix for results + const GrB_Matrix Mask, // optional mask for C, unused if NULL + const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) + const GrB_IndexUnaryOp op, // operator to apply to the entries + const GrB_Matrix A, // first input: matrix A + const GrB_Scalar y, // second input: scalar y + const GrB_Descriptor desc // descriptor for C, mask, and A +) ; +\end{verbatim} } \end{mdframed} -The \verb'GrB_Row_assign' and \verb'GrB_Col_assign' operations are slightly -different. They only affect a single row or column of ${\bf C}$. -For \verb'GrB_Row_assign', Table~\ref{outsubmatrix} only applies to entries in -the single row \verb'C(i,J)' that are outside the list of indices, \verb'J'. -For \verb'GrB_Col_assign', Table~\ref{outsubmatrix} only applies to entries in -the single column \verb'C(I,j)' that are outside the list of indices, \verb'I'. +\verb'GrB_Matrix_select_*' applies a \verb'GrB_IndexUnaryOp' operator to the +entries of a matrix. If the operator evaluates as \verb'true' for the entry +\verb'A(i,j)', it is copied to the matrix \verb'T', or not copied if the +operator evaluates to \verb'false'. The input matrix \verb'A' may be +transposed first. The entries in \verb'A' are typecasted into the \verb'xtype' +of the select operator. The final step is ${\bf C \langle M \rangle = C \odot +T}$, as described in Section~\ref{accummask}. -\begin{table} -{\small -\begin{tabular}{lllll|l} -\hline -repl & accum & ${\bf C}$ & ${\bf C=Z}$ & mask & action taken by \verb'GrB_assign' \\ +The matrix \verb'T' has the same size and type as \verb'A' (or the transpose of +\verb'A' if the input is transposed via the descriptor). The entries of +\verb'T' are a subset of those of \verb'A'. Each entry \verb'A(i,j)' of +\verb'A' is passed to the \verb'op', as $z=f(a_{ij},i,j,y)$. If +\verb'A' is transposed first then the operator is applied to entries in the +transposed matrix, \verb"A'". If $z$ is returned as true, then the entry is +copied into \verb'T', unchanged. If it returns false, the entry does not +appear in \verb'T'. + +The action of \verb'GrB_select' with the built-in index-unary operators is +described in the table below. The MATLAB analogs are precise for \verb'tril' +and \verb'triu', but shorthand for the other operations. 
The MATLAB
+\verb'diag' function returns a column with the diagonal, if \verb'A' is a
+matrix, whereas the matrix \verb'T' in \verb'GrB_select' always has the same
+size as \verb'A' (or its transpose if the \verb'GrB_INP0' is set to
+\verb'GrB_TRAN'). In the MATLAB analog column, \verb'diag' is as if it
+operates like \verb'GrB_select', where \verb'T' is a matrix.
+
+The following operators may be used on matrices with a user-defined type:
+\verb'GrB_ROWINDEX_*',
+\verb'GrB_COLINDEX_*',
+\verb'GrB_DIAGINDEX_*',
+\verb'GrB_TRIL', \newline
+\verb'GrB_TRIU',
+\verb'GrB_DIAG',
+\verb'GrB_OFFDIAG',
+\verb'GrB_COLLE',
+\verb'GrB_COLGT',
+\verb'GrB_ROWLE',
+and
+\verb'GrB_ROWGT'.
+
+For floating-point values, comparisons with \verb'NaN' always return false.
+The \verb'GrB_VALUE*' operators should not be used with a scalar \verb'y' that is
+equal to \verb'NaN'. For this case, create a user-defined select operator that
+performs the test with the ANSI C \verb'isnan' function instead.
+
+\vspace{0.2in}
+\noindent
+{\footnotesize
+\begin{tabular}{lll}
 \hline
-    - &-  & $c_{ij}$ & $c_{ij}$ & 1 &  \\
-    - &-  & -        & -        & 1 &  \\
-    - &-  & $c_{ij}$ & $c_{ij}$ & 0 &  \\
-    - &-  & -        & -        & 0 &  \\
+GraphBLAS name & Octave/MATLAB & description \\
+               & analog        & \\
 \hline
-    yes & - & $c_{ij}$ & $c_{ij}$ & 1 &  \\
-    yes & - & -        & -        & 1 &  \\
-    yes & - & $c_{ij}$ & $c_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes & - & -        & -        & 0 &  \\
+\verb'GrB_ROWINDEX_*'  & \verb'z=i+y'     & select \verb'A(i,j)' if \verb'i != -y' \\
+\verb'GrB_COLINDEX_*'  & \verb'z=j+y'     & select \verb'A(i,j)' if \verb'j != -y' \\
+\verb'GrB_DIAGINDEX_*' & \verb'z=j-(i+y)' & select \verb'A(i,j)' if \verb'j != i+y' \\
 \hline
-    - &yes & $c_{ij}$ & $c_{ij}$ & 1 &  \\
-    - &yes & -        & -        & 1 &  \\
-    - &yes & $c_{ij}$ & $c_{ij}$ & 0 &  \\
-    - &yes & -        & -        & 0 &  \\
+\verb'GrB_TRIL'    & \verb'z=(j<=(i+y))' & select entries on or below the \verb'y'th diagonal \\
+\verb'GrB_TRIU'    & \verb'z=(j>=(i+y))' & select entries on or above the \verb'y'th diagonal \\
+\verb'GrB_DIAG'    & \verb'z=(j==(i+y))' & select entries on the \verb'y'th diagonal \\
+\verb'GrB_OFFDIAG' & \verb'z=(j!=(i+y))' & select entries not on the \verb'y'th diagonal \\
+\verb'GrB_COLLE'   & \verb'z=(j<=y)'     & select entries in columns 0 to \verb'y' \\
+\verb'GrB_COLGT'   & \verb'z=(j>y)'      & select entries in columns \verb'y+1' and above \\
+\verb'GrB_ROWLE'   & \verb'z=(i<=y)'     & select entries in rows 0 to \verb'y' \\
+\verb'GrB_ROWGT'   & \verb'z=(i>y)'      & select entries in rows \verb'y+1' and above \\
 \hline
-    yes & yes & $c_{ij}$ & $c_{ij}$ & 1 &  \\
-    yes & yes & -        & -        & 1 &  \\
-    yes & yes & $c_{ij}$ & $c_{ij}$ & 0 & delete $c_{ij}$ (because of \verb'GrB_REPLACE') \\
-    yes & yes & -        & -        & 0 &  \\
+\verb'GrB_VALUENE_T' & \verb'z=(aij!=y)' & select \verb'A(i,j)' if it is not equal to \verb'y'\\
+\verb'GrB_VALUEEQ_T' & \verb'z=(aij==y)' & select \verb'A(i,j)' if it is equal to \verb'y'\\
+\verb'GrB_VALUEGT_T' & \verb'z=(aij>y)'  & select \verb'A(i,j)' if it is greater than \verb'y' \\
+\verb'GrB_VALUEGE_T' & \verb'z=(aij>=y)' & select \verb'A(i,j)' if it is greater than or equal to \verb'y' \\
+\verb'GrB_VALUELT_T' & \verb'z=(aij<y)'  & select \verb'A(i,j)' if it is less than \verb'y' \\
+\verb'GrB_VALUELE_T' & \verb'z=(aij<=y)' & select \verb'A(i,j)' if it is less than or equal to \verb'y' \\
 \hline
-\end{tabular}
-}
-\caption{Results of assign and subassign for entries outside the ${\bf C(I,J)}$ submatrix \label{outsubmatrix}}
-\end{table}
+\end{tabular}
+}
+\vspace{0.2in}

+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Matrix\_reduce\_Monoid:} reduce a matrix to a vector}
+%-------------------------------------------------------------------------------
+\label{reduce_to_vector}
+
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_reduce                 // w<mask> = accum (w,reduce(A))
+(
+    GrB_Vector w,                   // input/output vector for results
+    const GrB_Vector mask,          // optional mask for w, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for z=accum(w,t)
+    const GrB_Monoid monoid,        // reduce monoid for t=reduce(A)
+    const GrB_Matrix A,             // first input: matrix A
+    const GrB_Descriptor desc       // descriptor for w, mask, and A
+) ;
+\end{verbatim} }
\end{mdframed}

+\verb'GrB_Matrix_reduce_Monoid'
+reduces a matrix to a column vector using a monoid, roughly analogous
+to \verb"t = sum (A')" in MATLAB, in the default case, where \verb't' is a
+column vector. By default, the method reduces across the rows to
+obtain a column vector; use \verb'GrB_TRAN' to reduce down the columns.

-After computing
-${\bf C} (0,1) {\bf \langle M \rangle} = {\bf C}(0,1) + {\bf A}$,
-the result is
-
-\[
-{\bf C} = \left[
-    \begin{array}{rr}
-    11 & 112 \\
-    21 &  22 \\
-    \end{array}
-    \right].
-\]
+The input matrix \verb'A' may be transposed first. Its entries are then
+typecast into the type of the \verb'reduce' operator or monoid. The reduction
+is applied to all entries in \verb'A (i,:)' to produce the scalar \verb't (i)'.
+This is done without the use of the identity value of the monoid. If the
+\verb'i'th row \verb'A (i,:)' has no entries, then \verb't (i)' is not an entry
+in \verb't' and its value is implicit. If \verb'A (i,:)' has a single entry,
+then that is the result \verb't (i)' and \verb'reduce' is not applied at all
+for the \verb'i'th row. Otherwise, multiple entries in row \verb'A (i,:)' are
+reduced via the \verb'reduce' operator or monoid to obtain a single scalar,
+the result \verb't (i)'.

-Only the ${\bf C(I,J)}$ submatrix, the single entry ${\bf C}(0,1)$, is modified
-by \verb'GxB_subassign'. The entry ${\bf C}(1,0)=21$ is unaffected by
-\verb'GxB_subassign', but it is deleted by \verb'GrB_assign'.
+The final step is ${\bf w \langle m \rangle = w \odot t}$, as described
+in Section~\ref{accummask}, except that all the
+terms are column vectors instead of matrices.

\newpage
%-------------------------------------------------------------------------------
-\subsubsection{Performance of {\sf GxB\_subassign}, {\sf GrB\_assign}
-and {\sf GrB\_*\_setElement}}
+\subsubsection{{\sf GrB\_Vector\_reduce\_$<$type$>$:} reduce a vector to a scalar}
%-------------------------------------------------------------------------------
+\label{reduce_vector_to_scalar}

-When SuiteSparse:GraphBLAS uses non-blocking mode, the modifications to a
-matrix by \verb'GxB_subassign', \verb'GrB_assign', and \verb'GrB_*_setElement'
-can postponed, and computed all at once later on. This has a huge impact on
-performance.
-
-A sequence of assignments is fast if their completion can be postponed for as
-long as possible, or if they do not modify the pattern at all. Modifying the
-pattern can be costly, but it is fast if non-blocking mode can be fully
-exploited.
-
-Consider a sequence of $t$ submatrix assignments \verb'C(I,J)=C(I,J)+A' to an
-$n$-by-$n$ matrix \verb'C' where each submatrix \verb'A' has size $a$-by-$a$
-with $s$ entries, and where \verb'C' starts with $c$ entries.
-Assume the matrices are all stored in non-hypersparse form, by row
-(\verb'GxB_BY_ROW').
-
-If blocking mode is enabled, or if the sequence requires the matrix to be
-completed after each assignment, each of the $t$ assignments takes $O(a + s
-\log n)$ time to process the \verb'A' matrix and then $O(n + c + s \log s)$
-time to complete \verb'C'. The latter step uses \verb'GrB_*_build' to build an
-update matrix and then merge it with \verb'C'. This step does not occur if the
-sequence of assignments does not add new entries to the pattern of \verb'C',
-however. Assuming in the worst case that the pattern does change, the total
-time is $O (t \left[ a + s \log n + n + c + s \log s \right] )$.
-
-If the sequence can be computed with all updates postponed until the end of the
-sequence, then the total time is no worse than $O(a + s \log n)$ to process
-each \verb'A' matrix, for $t$ assignments, and then a single \verb'build' at
-the end, taking $O(n + c + st \log st)$ time.
-The total time is $O (t \left [a + s \log n \right] + (n + c + st \log st))$.
-If no new entries appear in
-\verb'C' the time drops to $O (t \left [a + s \log n \right])$, and in this
-case, the time for both methods is the same; both are equally efficient.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_reduce                 // c = accum (c, reduce_to_scalar (u))
+(
+    <type> *c,                      // result scalar
+    const GrB_BinaryOp accum,       // optional accum for c=accum(c,t)
+    const GrB_Monoid monoid,        // monoid to do the reduction
+    const GrB_Vector u,             // vector to reduce
+    const GrB_Descriptor desc       // descriptor (currently unused)
+) ;

-A few simplifying assumptions are useful to compare these times. Consider a
-graph of $n$ nodes with $O(n)$ edges, and with a constant bound on the degree
-of each node. The asymptotic bounds assume a worst-case scenario where
-\verb'C' has a least some dense rows (thus the $\log n$ terms). If these
-are not present, if both $t$ and $c$ are $O(n)$, and if $a$ and $s$ are
-constants, then the total time with blocking mode becomes $O(n^2)$, assuming
-the pattern of \verb'C' changes at each assignment. This very high for a
-sparse graph problem. In contrast, the non-blocking time becomes $O(n \log n)$
-under these same assumptions, which is asymptotically much faster.
+GrB_Info GrB_reduce                 // c = accum (c, reduce_to_scalar (u))
+(
+    GrB_Scalar c,                   // result scalar
+    const GrB_BinaryOp accum,       // optional accum for c=accum(c,t)
+    const GrB_Monoid monoid,        // monoid to do the reduction
+    const GrB_Vector u,             // vector to reduce
+    const GrB_Descriptor desc       // descriptor (currently unused)
+) ;
+\end{verbatim} } \end{mdframed}

-\newpage
-The difference in practice can be very dramatic, since $n$ can be many millions
-for sparse graphs with $n$ nodes and $O(n)$, which can be handled on a
-commodity laptop.
+\verb'GrB_Vector_reduce_<type>'
+reduces a vector to a scalar, analogous to \verb't = sum (u)' in MATLAB,
+except that in GraphBLAS any commutative and associative monoid can be used
+in the reduction.

-The following guidelines should be considered when using
-\verb'GxB_subassign', \verb'GrB_assign' and \verb'GrB_*_setElement'.
+The scalar \verb'c' can be a pointer to a C scalar type: \verb'bool',
+\verb'int8_t', ...
+\verb'float', \verb'double', or \verb'void *' for a user-defined type,
+or it can be a \verb'GrB_Scalar'.
+If \verb'c' is a \verb'void *' pointer to a user-defined type,
+the type must be identical to the type of the vector \verb'u'.
+This cannot be checked by GraphBLAS and thus results are undefined if the
+types are not the same.

-\begin{enumerate}
+If the vector \verb'u' has no entries, the identity value of the \verb'monoid'
+is copied into the scalar \verb't' (unless \verb'c' is a \verb'GrB_Scalar',
+in which case \verb't' is an empty \verb'GrB_Scalar', with no entry).
+Otherwise, all of the entries in the
+vector are reduced to a single scalar using the \verb'monoid'.

-\item A sequence of assignments that does not modify the pattern at all is
-fast, taking as little as $\Omega(1)$ time per entry modified. The worst case
-time complexity is $O(\log n)$ per entry, assuming they all modify a dense
-row of \verb'C' with \verb'n' entries, which can occur in practice.
It is
-more common, however, that most rows of \verb'C' have a constant number of
-entries, independent of \verb'n'. No work is ever left pending when the
-pattern of \verb'C' does not change.
+The descriptor is unused, but it appears in case it is needed in future
+versions of the GraphBLAS API.
+This function has no mask so its accumulator/mask step differs from the other
+GraphBLAS operations. It does not use the methods described in
+Section~\ref{accummask}, but uses the following method instead.

-\item A sequence of assignments that modifies the entries that already exist in
-the pattern of a matrix, or adds new entries to the pattern (using the same
-\verb'accum' operator), but does not delete any entries, is fast. The matrix
-is not completed until the end of the sequence.
+If \verb'accum' is \verb'NULL', then the scalar \verb't' is typecast into the
+type of \verb'c', and \verb'c = t' is the final result. Otherwise, the scalar
+\verb't' is typecast into the \verb'ytype' of the \verb'accum' operator, and
+the value of \verb'c' (on input) is typecast into the \verb'xtype' of the
+\verb'accum' operator. Next, the scalar \verb'z = accum (c,t)' is computed, of
+the \verb'ztype' of the \verb'accum' operator. Finally, \verb'z' is typecast
+into the final result, \verb'c'.

-\item Similarly, a sequence that modifies existing entries, or deletes them,
-but does not add new ones, is also fast. This sequence can also repeatedly
-delete pre-existing entries and then reinstate them and still be fast. The
-matrix is not completed until the end of the sequence.
+If \verb'c' is a non-opaque scalar, no error message can be returned by
+\verb'GrB_error'. If \verb'c' is a \verb'GrB_Scalar', then
+\verb'GrB_error(&err,c)' can be used to return an error string, if an error
+occurs.

-\item A sequence that mixes assignments of types (2) and (3) above can be
-costly, since the matrix may need to be completed after each assignment. The
-time complexity can become quadratic in the worst case.
+\newpage
+%-------------------------------------------------------------------------------
+\subsubsection{{\sf GrB\_Matrix\_reduce\_$<$type$>$:} reduce a matrix to a scalar}
+%-------------------------------------------------------------------------------
+\label{reduce_matrix_to_scalar}

-\item However, any single assignment takes no more than $O (a + s \log n + n +
-c + s \log s )$ time, even including the time for a matrix completion, where
-\verb'C' is $n$-by-$n$ with $c$ entries and \verb'A' is $a$-by-$a$ with $s$
-entries. This time is essentially linear in the size of the matrix \verb'C',
-if \verb'A' is relatively small and sparse compared with \verb'C'. In this
-case, $n+c$ are the two dominant terms.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_reduce                 // c = accum (c, reduce_to_scalar (A))
+(
+    <type> *c,                      // result scalar
+    const GrB_BinaryOp accum,       // optional accum for c=accum(c,t)
+    const GrB_Monoid monoid,        // monoid to do the reduction
+    const GrB_Matrix A,             // matrix to reduce
+    const GrB_Descriptor desc       // descriptor (currently unused)
+) ;

-\item In general, \verb'GxB_subassign' is faster than \verb'GrB_assign'.
-If \verb'GrB_REPLACE' is used with \verb'GrB_assign', the entire matrix
-\verb'C' must be traversed. This is much slower than \verb'GxB_subassign',
-which only needs to examine the \verb'C(I,J)' submatrix. Furthermore,
-\verb'GrB_assign' must deal with a much larger \verb'Mask' matrix, whereas
-\verb'GxB_subassign' has a smaller mask.
Since its mask is smaller,
-\verb'GxB_subassign' takes less time than \verb'GrB_assign' to access the mask.
+GrB_Info GrB_reduce                 // c = accum (c, reduce_to_scalar (A))
+(
+    GrB_Scalar c,                   // result scalar
+    const GrB_BinaryOp accum,       // optional accum for c=accum(c,t)
+    const GrB_Monoid monoid,        // monoid to do the reduction
+    const GrB_Matrix A,             // matrix to reduce
+    const GrB_Descriptor desc       // descriptor (currently unused)
+) ;
+\end{verbatim} } \end{mdframed}

-\end{enumerate}
+\verb'GrB_Matrix_reduce_<type>' reduces a matrix \verb'A' to a scalar, roughly
+analogous to \verb't = sum (A (:))' in MATLAB. This function is identical to
+reducing a vector to a scalar, since the positions of the entries in a matrix
+or vector have no effect on the result. Refer to the reduction to scalar
+described in the previous Section~\ref{reduce_vector_to_scalar}.

-% see GraphBLAS/Test/test46.m
+\newpage
+%===============================================================================
+\subsection{{\sf GrB\_transpose:} transpose a matrix} %=========================
+%===============================================================================
+\label{transpose}

-Submatrix assignment in SuiteSparse:GraphBLAS is extremely efficient, even
-without considering the advantages of non-blocking mode discussed in
-Section~\ref{compare_assign}. It can be up to 1000x faster than MATLAB R2019b,
-or even higher depending on the kind of matrix assignment. MATLAB logical
-indexing (the mask of GraphBLAS) is extremely faster with GraphBLAS as compared
-in MATLAB R2019b; differences of up to 250,000x have been observed (0.4 seconds
-in GraphBLAS versus 28 hours in MATLAB).
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_transpose              // C = accum (C, A')
+(
+    GrB_Matrix C,                   // input/output matrix for results
+    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const GrB_Matrix A,             // first input: matrix A
+    const GrB_Descriptor desc       // descriptor for C, Mask, and A
+) ;
+\end{verbatim} } \end{mdframed}

-All of the 28 variants (each with their own source code) are either
-asymptotically optimal, or to within a log factor of being asymptotically
-optimal. The methods are also fully parallel. For hypersparse matrices, the
-term $n$ in the expressions in the above discussion is dropped, and is replaced
-with $h \log h$, at the worst case, where $h << n$ is the number of non-empty
-columns of a hypersparse matrix stored by column, or the number of non-empty
-rows of a hypersparse matrix stored by row. In many methods, $n$ is replaced
-with $h$, not $h \log h$.
+\verb'GrB_transpose'
+transposes a matrix \verb'A', just like the array transpose \verb"T = A.'" in
+MATLAB. The internal result matrix \verb"T = A'" (or merely \verb"T = A" if
+\verb'A' is transposed via the descriptor) has the same type as \verb'A'. The
+final step is ${\bf C \langle M \rangle = C \odot T}$, as described in
+Section~\ref{accummask}, which typecasts \verb'T' as needed and applies the
+mask and accumulator.
+
+To be consistent with the rest of the GraphBLAS API regarding the
+descriptor, the input matrix \verb'A' may be transposed first by
+setting the \verb'GrB_INP0' setting to \verb'GrB_TRAN'. This results in
+a double transpose, and thus \verb'A' is not transposed at all.
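+
+For example, the following sketch (with \verb'A' assumed to be a previously
+created \verb'GrB_FP64' matrix) computes an explicit transpose into a new
+matrix \verb'T', with no mask or accumulator:
+
+{\small
+\begin{verbatim}
+    GrB_Matrix T ;
+    GrB_Index m, n ;
+    GrB_Matrix_nrows (&m, A) ;
+    GrB_Matrix_ncols (&n, A) ;
+    GrB_Matrix_new (&T, GrB_FP64, n, m) ;
+    // T = A'
+    GrB_transpose (T, NULL, NULL, A, NULL) ;
+\end{verbatim}}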
\newpage
%===============================================================================
-\subsection{{\sf GrB\_apply:} apply a unary, binary, or index-unary operator}
+\subsection{{\sf GrB\_kronecker:} Kronecker product} %==========================
%===============================================================================
-\label{apply}
+\label{kron}

-\verb'GrB_apply' is the generic name for 92 specific functions:
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GrB_kronecker              // C = accum (C, kron(A,B))
+(
+    GrB_Matrix C,                   // input/output matrix for results
+    const GrB_Matrix Mask,          // optional mask for C, unused if NULL
+    const GrB_BinaryOp accum,       // optional accum for Z=accum(C,T)
+    const <operator> op,            // defines '*' for T=kron(A,B)
+    const GrB_Matrix A,             // first input: matrix A
+    const GrB_Matrix B,             // second input: matrix B
+    const GrB_Descriptor desc       // descriptor for C, Mask, A, and B
+) ;
+\end{verbatim} } \end{mdframed}

-\begin{packed_itemize}
-\item
-\verb'GrB_Vector_apply' and \verb'GrB_Matrix_apply' apply a unary operator to
-the entries of a matrix (two variants).
+\verb'GrB_kronecker' computes the Kronecker product,
+${\bf C \langle M \rangle = C \odot \mbox{kron}(A,B)}$ where
+\[
+\mbox{kron}{\bf (A,B)} =
+\left[
+    \begin{array}{ccc}
+    a_{00} \otimes {\bf B}    & \ldots & a_{0,n-1} \otimes {\bf B} \\
+    \vdots                    & \ddots & \vdots \\
+    a_{m-1,0} \otimes {\bf B} & \ldots & a_{m-1,n-1} \otimes {\bf B} \\
+    \end{array}
+\right]
+\]
+The $\otimes$ operator is defined by the \verb'op' parameter. It is applied in
+an element-wise fashion (like \verb'GrB_eWiseMult'), where the pattern of the
+submatrix $a_{ij} \otimes {\bf B}$ is the same as the pattern of ${\bf B}$ if
+$a_{ij}$ is an entry in the matrix ${\bf A}$, or empty otherwise. The input
+matrices \verb'A' and \verb'B' can be of any dimension, and both matrices may
+be transposed first via the descriptor, \verb'desc'. Entries in \verb'A' and
+\verb'B' are typecast into the input types of the \verb'op'. The matrix
+\verb'T=kron(A,B)' has the same type as the \verb'ztype' of the binary
+operator, \verb'op'. The final step is ${\bf C \langle M \rangle = C \odot
+T}$, as described in Section~\ref{accummask}.

-\item \verb'GrB_*_apply_BinaryOp1st_*' applies a binary
-operator where a single scalar is provided as the $x$ input to the binary
-operator.
-There are 30 variants, depending on the type of the scalar: (matrix or vector)
-x (13 built-in types, one for user-defined types, and a version for
-\verb'GrB_Scalar').
+The operator \verb'op' may be a \verb'GrB_BinaryOp', a \verb'GrB_Monoid', or a
+\verb'GrB_Semiring'. In the latter case, the multiplicative operator of
+the semiring is used.

-\item \verb'GrB_*_apply_BinaryOp2nd_*' applies a binary operator where a
-single scalar is provided as the $y$ input to the binary operator.
-There are 30 variants, depending on the type of the scalar: (matrix or vector)
-x (13 built-in types, one for user-defined types, and a version for
-\verb'GrB_Scalar').
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Printing GraphBLAS objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{fprint}

-\item \verb'GrB_*_apply_IndexOp_*' applies a \verb'GrB_IndexUnaryOp',
-single scalar is provided as the scalar $y$ input to the index-unary operator.
-There are 30 variants, depending on the type of the scalar: (matrix or vector)
-x (13 built-in types, one for user-defined types, and a version for
-\verb'GrB_Scalar').
+The ten different objects handled by SuiteSparse:GraphBLAS are all opaque,
+although nearly all of their contents can be extracted via methods such as
+\verb'GrB_Matrix_extractTuples', \verb'GrB_Matrix_extractElement',
+\verb'GxB_Matrix_type', and so on. The GraphBLAS C API has no mechanism for
+printing all the contents of GraphBLAS objects, but such a facility is
+helpful for debugging. Ten type-specific methods and two type-generic methods
+are provided:
-\end{packed_itemize}
+\vspace{0.2in}
+{\footnotesize
+\begin{tabular}{ll}
+\hline
+\verb'GxB_Type_fprint' & print and check a \verb'GrB_Type' \\
+\verb'GxB_UnaryOp_fprint' & print and check a \verb'GrB_UnaryOp' \\
+\verb'GxB_BinaryOp_fprint' & print and check a \verb'GrB_BinaryOp' \\
+\verb'GxB_IndexUnaryOp_fprint' & print and check a \verb'GrB_IndexUnaryOp' \\
+\verb'GxB_Monoid_fprint' & print and check a \verb'GrB_Monoid' \\
+\verb'GxB_Semiring_fprint' & print and check a \verb'GrB_Semiring' \\
+\verb'GxB_Descriptor_fprint' & print and check a \verb'GrB_Descriptor' \\
+\verb'GxB_Matrix_fprint' & print and check a \verb'GrB_Matrix' \\
+\verb'GxB_Vector_fprint' & print and check a \verb'GrB_Vector' \\
+\verb'GxB_Scalar_fprint' & print and check a \verb'GrB_Scalar' \\
+\hline
+\verb'GxB_fprint' & print/check any object to a file \\
+\verb'GxB_print' & print/check any object to \verb'stdout' \\
+\hline
+\end{tabular}
+}
+\vspace{0.2in}
-The generic
-name appears in the function prototypes, but the specific function name is used
-when describing each variation. When discussing features that apply to all
-versions, the simple name \verb'GrB_apply' is used.
+These methods do not modify the status of any object, and thus they
+cannot return an error string for use by \verb'GrB_error'.
-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_apply:} apply a unary operator to a vector}
-%-------------------------------------------------------------------------------
-\label{apply_vector}
+If a matrix or vector
+has not been completed, the pending computations are guaranteed to {\em not} be
+performed. The reason is simple. It is possible for a bug in the user
+application (such as accessing memory outside the bounds of an array) to mangle
+the internal content of a GraphBLAS object, and the \verb'GxB_*print' methods
+can be helpful tools to track down this bug. If \verb'GxB_*print' attempted to
+complete any computations prior to printing or checking the contents of the
+matrix or vector, then further errors could occur, including a segfault.
+
+By contrast, GraphBLAS methods and operations that return values into
+user-provided arrays or variables might finish pending operations before they
+return these values, and this would change their state. Since they do not
+change the state of any object, the \verb'GxB_*print' methods provide a useful
+alternative for debugging, and for a quick understanding of what GraphBLAS is
+computing while developing a user application.
+ +Each of the methods has a parameter of type \verb'GxB_Print_Level' that +specifies the amount to print: -\begin{mdframed}[userdefinedwidth=6in] {\footnotesize \begin{verbatim} -GrB_Info GrB_apply // w = accum (w, op(u)) -( - GrB_Vector w, // input/output vector for results - const GrB_Vector mask, // optional mask for w, unused if NULL - const GrB_BinaryOp accum, // optional accum for z=accum(w,t) - const GrB_UnaryOp op, // operator to apply to the entries - const GrB_Vector u, // first input: vector u - const GrB_Descriptor desc // descriptor for w and mask -) ; -\end{verbatim} } \end{mdframed} +typedef enum +{ + GxB_SILENT = 0, // nothing is printed, just check the object + GxB_SUMMARY = 1, // print a terse summary + GxB_SHORT = 2, // short description, about 30 entries of a matrix + GxB_COMPLETE = 3, // print the entire contents of the object + GxB_SHORT_VERBOSE = 4, // GxB_SHORT but with "%.15g" for doubles + GxB_COMPLETE_VERBOSE = 5 // GxB_COMPLETE but with "%.15g" for doubles +} +GxB_Print_Level ; \end{verbatim}} -\verb'GrB_Vector_apply' applies a unary operator to the entries of a vector, -analogous to \verb't = op(u)' in MATLAB except the operator \verb'op' is only -applied to entries in the pattern of \verb'u'. Implicit values outside the -pattern of \verb'u' are not affected. The entries in \verb'u' are typecasted -into the \verb'xtype' of the unary operator. The vector \verb't' has the same -type as the \verb'ztype' of the unary operator. The final step is ${\bf w -\langle m \rangle = w \odot t}$, as described in Section~\ref{accummask}, -except that all the terms are column vectors instead of matrices. +The ten type-specific functions include an additional argument, the +\verb'name' string. The \verb'name' is printed at the beginning of the display +(assuming the print level is not \verb'GxB_SILENT') so that the object can be +more easily identified in the output. For the type-generic methods +\verb'GxB_fprint' and \verb'GxB_print', the \verb'name' string is the variable +name of the object itself. + +If the file \verb'f' is \verb'NULL', \verb'stdout' is used. +If \verb'name' is \verb'NULL', it is treated +as the empty string. These are not error conditions. + +The methods check their input objects carefully and extensively, even when +\verb'pr' is equal to \verb'GxB_SILENT'. The following error codes can be +returned: + +\begin{packed_itemize} +\item \verb'GrB_SUCCESS': object is valid +\item \verb'GrB_UNINITIALIZED_OBJECT': object is not initialized +\item \verb'GrB_INVALID_OBJECT': object is not valid +\item \verb'GrB_NULL_POINTER': object is a NULL pointer +\item \verb'GrB_INVALID_VALUE': \verb'fprintf' returned an I/O error. +\end{packed_itemize} + +The content of any GraphBLAS object is opaque, and subject to change. As a +result, the exact content and format of what is printed is +implementation-dependent, and will change from version to version of +SuiteSparse:GraphBLAS. Do not attempt to rely on the exact content or format +by trying to parse the resulting output via another program. The intent of +these functions is to produce a report of an object for visual inspection. If +the user application needs to extract content from a GraphBLAS matrix or +vector, use \verb'GrB_*_extractTuples' or the import/export methods instead. + +GraphBLAS matrices and vectors are zero-based, where indices of an $n$-by-$n$ +matrix are in the range 0 to $n-1$. However, Octave, MATLAB, and Julia prefer +to print their matrices and vectors as one-based. 
To enable 1-based printing,
+use \verb'GxB_set (GxB_PRINT_1BASED, true)'. Printing is done as zero-based by
+default.
\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_apply:} apply a unary operator to a matrix}
-%-------------------------------------------------------------------------------
-\label{apply_matrix}
+%===============================================================================
+\subsection{{\sf GxB\_fprint:} Print a GraphBLAS object to a file} %============
+%===============================================================================
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // C = accum (C, op(A)) or op(A')
+GrB_Info GxB_fprint // print and check a GraphBLAS object
(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_UnaryOp op, // operator to apply to the entries
- const GrB_Matrix A, // first input: matrix A
- const GrB_Descriptor desc // descriptor for C, mask, and A
+ GrB_<objecttype> object, // object to print and check
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Matrix_apply'
-applies a unary operator to the entries of a matrix, analogous to
-\verb'T = op(A)' in MATLAB except the operator \verb'op' is only applied to
-entries in the pattern of \verb'A'. Implicit values outside the pattern of
-\verb'A' are not affected. The input matrix \verb'A' may be transposed first.
-The entries in \verb'A' are typecasted into the \verb'xtype' of the unary
-operator. The matrix \verb'T' has the same type as the \verb'ztype' of the
-unary operator. The final step is ${\bf C \langle M \rangle = C \odot T}$, as
-described in Section~\ref{accummask}.
+The \verb'GxB_fprint' function prints the contents of any of the ten GraphBLAS
+objects to the file \verb'f'. If \verb'f' is \verb'NULL', the results are
+printed to \verb'stdout'. For example, to print the entire contents of a
+matrix \verb'A' to the file \verb'f', use
+\verb'GxB_fprint (A, GxB_COMPLETE, f)'.
-The built-in \verb'GrB_IDENTITY_'$T$ operators (one for each built-in type $T$)
-are very useful when combined with this function, enabling it to compute ${\bf
-C \langle M \rangle = C \odot A}$. This makes \verb'GrB_apply' a direct
-interface to the accumulator/mask function for both matrices and vectors.
-The \verb'GrB_IDENTITY_'$T$ operators also provide the fastest stand-alone
-typecasting methods in SuiteSparse:GraphBLAS, with all $13 \times 13=169$
-methods appearing as individual functions, to typecast between any of the 13
-built-in types.
+%===============================================================================
+\subsection{{\sf GxB\_print:} Print a GraphBLAS object to {\sf stdout}} %=======
+%===============================================================================
+\label{gxb_print}
-To compute ${\bf C \langle M \rangle = A}$ or ${\bf C \langle M \rangle = C
-\odot A}$ for user-defined types, the user application would need to define an
-identity operator for the type. Since GraphBLAS cannot detect that it is an
-identity operator, it must call the operator to make the full copy \verb'T=A'
-and apply the operator to each entry of the matrix or vector.
+\begin{mdframed}[userdefinedwidth=6in]
+{\footnotesize
+\begin{verbatim}
+GrB_Info GxB_print // print and check a GraphBLAS object
+(
+ GrB_<objecttype> object, // object to print and check
+ GxB_Print_Level pr // print level
+) ;
+\end{verbatim} } \end{mdframed}
-The other GraphBLAS operation that provides a direct interface to the
-accumulator/mask function is \verb'GrB_transpose', which does not require an
-operator to perform this task. As a result, \verb'GrB_transpose' can be used
-as an efficient and direct interface to the accumulator/mask function for
-both built-in and user-defined types. However, it is only available for
-matrices, not vectors.
+\verb'GxB_print' is the same as \verb'GxB_fprint', except that it prints the
+contents of the object to \verb'stdout' instead of a file \verb'f'. For
+example, to print the entire contents of a matrix \verb'A', use
+\verb'GxB_print (A, GxB_COMPLETE)'.
-\newpage
%===============================================================================
-\subsubsection{{\sf GrB\_Vector\_apply\_BinaryOp1st:} apply a binary operator to a vector; 1st scalar binding}
+\subsection{{\sf GxB\_Type\_fprint:} Print a {\sf GrB\_Type}}
%===============================================================================
-\label{vector_apply1st}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // w = accum (w, op(x,u))
+GrB_Info GxB_Type_fprint // print and check a GrB_Type
(
- GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const GrB_BinaryOp op, // operator to apply to the entries
- const <type> x, // first input: scalar x
- const GrB_Vector u, // second input: vector u
- const GrB_Descriptor desc // descriptor for w and mask
+ GrB_Type type, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Vector_apply_BinaryOp1st_<type>' applies a binary operator
-$z=f(x,y)$ to a vector, where a scalar $x$ is bound to the first input of the
-operator.
-The scalar \verb'x' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Vector_apply'.
+For example, \verb'GxB_Type_fprint (GrB_BOOL, "boolean type", GxB_COMPLETE, f)'
+prints the contents of the \verb'GrB_BOOL' object to the file \verb'f'.
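+As a further sketch, a user-defined type can be checked in the same manner.
+The type \verb'Complex' and its C definition below are hypothetical, purely
+for illustration, and are not part of GraphBLAS:
+
+{\footnotesize
+\begin{verbatim}
+    typedef struct { double real ; double imag ; } complex_t ;
+    GrB_Type Complex ;
+    GrB_Type_new (&Complex, sizeof (complex_t)) ;
+    // print to stdout, since the file argument is NULL
+    GxB_Type_fprint (Complex, "Complex", GxB_COMPLETE, NULL) ;
+\end{verbatim}}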
+\newpage
%===============================================================================
-\subsubsection{{\sf GrB\_Vector\_apply\_BinaryOp2nd:} apply a binary operator to a vector; 2nd scalar binding}
+\subsection{{\sf GxB\_UnaryOp\_fprint:} Print a {\sf GrB\_UnaryOp}}
%===============================================================================
-\label{vector_apply2nd}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // w = accum (w, op(u,y))
+GrB_Info GxB_UnaryOp_fprint // print and check a GrB_UnaryOp
(
- GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const GrB_BinaryOp op, // operator to apply to the entries
- const GrB_Vector u, // first input: vector u
- const <type> y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for w and mask
+ GrB_UnaryOp unaryop, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Vector_apply_BinaryOp2nd_<type>' applies a binary operator
-$z=f(x,y)$ to a vector, where a scalar $y$ is bound to the second input of the
-operator.
-The scalar \verb'x' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Vector_apply'.
+For example,
+\verb'GxB_UnaryOp_fprint (GrB_LNOT, "not", GxB_COMPLETE, f)'
+prints the \verb'GrB_LNOT' unary operator to the file \verb'f'.
+
-\newpage
%===============================================================================
-\subsubsection{{\sf GrB\_Vector\_apply\_IndexOp:} apply an index-unary operator to a vector}
+\subsection{{\sf GxB\_BinaryOp\_fprint:} Print a {\sf GrB\_BinaryOp}}
%===============================================================================
-\label{vector_apply_idxunop}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // w = accum (w, op(u,y))
+GrB_Info GxB_BinaryOp_fprint // print and check a GrB_BinaryOp
(
- GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const GrB_IndexUnaryOp op, // operator to apply to the entries
- const GrB_Vector u, // first input: vector u
- const <type> y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for w and mask
+ GrB_BinaryOp binaryop, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Vector_apply_IndexOp_<type>' applies an index-unary operator
-$z=f(x,i,0,y)$ to a vector.
-The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Vector_apply'.
+For example,
+\verb'GxB_BinaryOp_fprint (GrB_PLUS_FP64, "plus", GxB_COMPLETE, f)' prints the
+\verb'GrB_PLUS_FP64' binary operator to the file \verb'f'.
+
%===============================================================================
-\subsubsection{{\sf GrB\_Matrix\_apply\_BinaryOp1st:} apply a binary operator to a matrix; 1st scalar binding}
+\subsection{{\sf GxB\_IndexUnaryOp\_fprint:} Print a {\sf GrB\_IndexUnaryOp}}
%===============================================================================
-\label{matrix_apply1st}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // C=accum(C,op(x,A))
+GrB_Info GxB_IndexUnaryOp_fprint // print and check a GrB_IndexUnaryOp
(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_BinaryOp op, // operator to apply to the entries
- const <type> x, // first input: scalar x
- const GrB_Matrix A, // second input: matrix A
- const GrB_Descriptor desc // descriptor for C, mask, and A
+ GrB_IndexUnaryOp op, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Matrix_apply_BinaryOp1st_<type>' applies a binary operator
-$z=f(x,y)$ to a matrix, where a scalar $x$ is bound to the first input of the
-operator.
-The scalar \verb'x' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Matrix_apply'.
+For example,
+\verb'GxB_IndexUnaryOp_fprint (GrB_TRIL, "tril", GxB_COMPLETE, f)' prints
+the \verb'GrB_TRIL' index-unary operator to the file \verb'f'.
\newpage
%===============================================================================
-\subsubsection{{\sf GrB\_Matrix\_apply\_BinaryOp2nd:} apply a binary operator to a matrix; 2nd scalar binding}
+\subsection{{\sf GxB\_Monoid\_fprint:} Print a {\sf GrB\_Monoid}}
%===============================================================================
-\label{matrix_apply2nd}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // C=accum(C,op(A,y))
+GrB_Info GxB_Monoid_fprint // print and check a GrB_Monoid
(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_BinaryOp op, // operator to apply to the entries
- const GrB_Matrix A, // first input: matrix A
- const <type> y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for C, mask, and A
+ GrB_Monoid monoid, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Matrix_apply_BinaryOp2nd_<type>' applies a binary operator
-$z=f(x,y)$ to a matrix, where a scalar $x$ is bound to the second input of the
-operator.
-The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Matrix_apply'.
+For example,
+\verb'GxB_Monoid_fprint (GxB_PLUS_FP64_MONOID, "plus monoid",'
+\verb'GxB_COMPLETE, f)'
+prints the predefined \verb'GxB_PLUS_FP64_MONOID' (based on the binary
+operator \verb'GrB_PLUS_FP64') to the file \verb'f'.
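+A user-defined monoid can be checked the same way. The following sketch is
+illustrative only (the monoid \verb'Max' and its identity value are chosen for
+the example; \verb'math.h' supplies \verb'INFINITY'):
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Monoid Max ;
+    // max monoid for double, with identity -INFINITY
+    GrB_Monoid_new_FP64 (&Max, GrB_MAX_FP64, (double) -INFINITY) ;
+    GxB_Monoid_fprint (Max, "max monoid", GxB_COMPLETE, NULL) ;
+\end{verbatim}}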
%===============================================================================
-\subsubsection{{\sf GrB\_Matrix\_apply\_IndexOp:} apply an index-unary operator to a matrix}
+\subsection{{\sf GxB\_Semiring\_fprint:} Print a {\sf GrB\_Semiring}}
%===============================================================================
-\label{matrix_apply_idxunop}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_apply // C=accum(C,op(A,y))
+GrB_Info GxB_Semiring_fprint // print and check a GrB_Semiring
(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_IndexUnaryOp op, // operator to apply to the entries
- const GrB_Matrix A, // first input: matrix A
- const <type> y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for C, mask, and A
+ GrB_Semiring semiring, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Matrix_apply_IndexOp_<type>' applies an index-unary operator
-$z=f(x,i,j,y)$ to a matrix.
-The scalar \verb'y' can be a non-opaque C scalar corresponding to a built-in
-type, a \verb'void *' for user-defined types, or a \verb'GrB_Scalar'.
-It is otherwise identical to \verb'GrB_Matrix_apply'.
+For example,
+\verb'GxB_Semiring_fprint (GxB_PLUS_TIMES_FP64, "standard",'
+\verb'GxB_COMPLETE, f)'
+prints the predefined \verb'GxB_PLUS_TIMES_FP64' semiring to the file \verb'f'.
-\newpage
%===============================================================================
-\subsection{{\sf GrB\_select:} select entries based on an index-unary operator}
+\subsection{{\sf GxB\_Descriptor\_fprint:} Print a {\sf GrB\_Descriptor}}
%===============================================================================
-\label{select}
-
-The \verb'GrB_select' function is the generic name for 30 specific functions,
-depending on whether it operates on a matrix or vector, and depending on the
-type of the scalar \verb'y': (matrix or vector) x (13 built-in types,
-\verb'void *' for user-defined types, and a \verb'GrB_Scalar'). The generic
-name appears in the function prototypes, but the specific function name is used
-when describing each variation. When discussing features that apply to both
-versions, the simple name \verb'GrB_select' is used.
-
-% \newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_select:} select entries from a vector}
-%-------------------------------------------------------------------------------
-\label{select_vector}
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_select // w = accum (w, op(u))
+GrB_Info GxB_Descriptor_fprint // print and check a GrB_Descriptor
(
- GrB_Vector w, // input/output vector for results
- const GrB_Vector mask, // optional mask for w, unused if NULL
- const GrB_BinaryOp accum, // optional accum for z=accum(w,t)
- const GrB_IndexUnaryOp op, // operator to apply to the entries
- const GrB_Vector u, // first input: vector u
- const <type> y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for w and mask
+ GrB_Descriptor descriptor, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Vector_select_*' applies a \verb'GrB_IndexUnaryOp' operator to the
-entries of a vector. If the operator evaluates as \verb'true' for the entry
-\verb'u(i)', it is copied to the vector \verb't', or not copied if the operator
-evaluates to \verb'false'. The vector \verb't' is then written to the result
-\verb'w' via the mask/accumulator step. This operation operates on vectors
-just as if they were \verb'm'-by-1 matrices, except that GraphBLAS never
-transposes a vector via the descriptor. Refer to the next section
-(\ref{select_matrix}) on \verb'GrB_Matrix_select' for more details.
+For example,
+\verb'GxB_Descriptor_fprint (d, "descriptor", GxB_COMPLETE, f)'
+prints the descriptor \verb'd' to the file \verb'f'.
\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_select:} apply a select operator to a matrix}
-%-------------------------------------------------------------------------------
-\label{select_matrix}
+%===============================================================================
+\subsection{{\sf GxB\_Matrix\_fprint:} Print a {\sf GrB\_Matrix}}
+%===============================================================================
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_select // C=accum(C,op(A))
+GrB_Info GxB_Matrix_fprint // print and check a GrB_Matrix
(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const GrB_IndexUnaryOp op, // operator to apply to the entries
- const GrB_Matrix A, // first input: matrix A
- const GrB_Scalar y, // second input: scalar y
- const GrB_Descriptor desc // descriptor for C, mask, and A
+ GrB_Matrix A, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Matrix_select_*' applies a \verb'GrB_IndexUnaryOp' operator to the
-entries of a matrix. If the operator evaluates as \verb'true' for the entry
-\verb'A(i,j)', it is copied to the matrix \verb'T', or not copied if the
-operator evaluates to \verb'false'. The input matrix \verb'A' may be
-transposed first. The entries in \verb'A' are typecasted into the \verb'xtype'
-of the select operator. The final step is ${\bf C \langle M \rangle = C \odot
-T}$, as described in Section~\ref{accummask}.
- -The matrix \verb'T' has the same size and type as \verb'A' (or the transpose of -\verb'A' if the input is transposed via the descriptor). The entries of -\verb'T' are a subset of those of \verb'A'. Each entry \verb'A(i,j)' of -\verb'A' is passed to the \verb'op', as $z=f(a_{ij},i,j,y)$. If -\verb'A' is transposed first then the operator is applied to entries in the -transposed matrix, \verb"A'". If $z$ is returned as true, then the entry is -copied into \verb'T', unchanged. If it returns false, the entry does not -appear in \verb'T'. - -The action of \verb'GrB_select' with the built-in index-unary operators is -described in the table below. The MATLAB analogs are precise for \verb'tril' -and \verb'triu', but shorthand for the other operations. The MATLAB -\verb'diag' function returns a column with the diagonal, if \verb'A' is a -matrix, whereas the matrix \verb'T' in \verb'GrB_select' always has the same -size as \verb'A' (or its transpose if the \verb'GrB_INP0' is set to -\verb'GrB_TRAN'). In the MATLAB analog column, \verb'diag' is as if it -operates like \verb'GrB_select', where \verb'T' is a matrix. - -The following operators may be used on matrices with a user-defined type: -\verb'GrB_ROWINDEX_*', -\verb'GrB_COLINDEX_*', -\verb'GrB_DIAGINDEX_*', -\verb'GrB_TRIL', \newline -\verb'GrB_TRIU', -\verb'GrB_DIAG', -\verb'GrB_OFFIAG', -\verb'GrB_COLLE', -\verb'GrB_COLGT', -\verb'GrB_ROWLE', -and -\verb'GrB_ROWGT'. - -For floating-point values, comparisons with \verb'NaN' always return false. -The \verb'GrB_VALUE*' operators should not be used with a scalar \verb'y' that is -equal to \verb'NaN'. For this case, create a user-defined select operator that -performs the test with the ANSI C \verb'isnan' function instead. +For example, \verb'GxB_Matrix_fprint (A, "my matrix", GxB_SHORT, f)' +prints about 30 entries from the matrix \verb'A' to the file \verb'f'. 
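+As a fuller sketch of a typical debugging session (the file name and the
+2-by-2 matrix below are arbitrary, chosen only for illustration):
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Matrix A ;
+    GrB_Matrix_new (&A, GrB_FP64, 2, 2) ;
+    GrB_Matrix_setElement_FP64 (A, 3.5, 0, 0) ;
+    GrB_Matrix_setElement_FP64 (A, 4.2, 1, 0) ;
+    FILE *f = fopen ("A_report.txt", "w") ;
+    if (f != NULL)
+    {
+        // check A and print all of its entries to the file
+        GxB_Matrix_fprint (A, "A", GxB_COMPLETE, f) ;
+        fclose (f) ;
+    }
+    GrB_free (&A) ;
+\end{verbatim}}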
-\vspace{0.2in} -\noindent -{\footnotesize -\begin{tabular}{lll} -\hline -GraphBLAS name & Octave/MATLAB & description \\ - & analog & \\ -\hline -\verb'GrB_ROWINDEX_*' & \verb'z=i+y' & select \verb'A(i,j)' if \verb'i != -y' \\ -\verb'GrB_COLINDEX_*' & \verb'z=j+y' & select \verb'A(i,j)' if \verb'j != -y' \\ -\verb'GrB_DIAGINDEX_*' & \verb'z=j-(i+y)' & select \verb'A(i,j)' if \verb'j != i+y' \\ -\hline -\verb'GrB_TRIL' & \verb'z=(j<=(i+y))' & select entries on or below the \verb'y'th diagonal \\ -\verb'GrB_TRIU' & \verb'z=(j>=(i+y))' & select entries on or above the \verb'y'th diagonal \\ -\verb'GrB_DIAG' & \verb'z=(j==(i+y))' & select entries on the \verb'y'th diagonal \\ -\verb'GrB_OFFDIAG' & \verb'z=(j!=(i+y))' & select entries not on the \verb'y'th diagonal \\ -\verb'GrB_COLLE' & \verb'z=(j<=y)' & select entries in columns 0 to \verb'y' \\ -\verb'GrB_COLGT' & \verb'z=(j>y)' & select entries in columns \verb'y+1' and above \\ -\verb'GrB_ROWLE' & \verb'z=(i<=y)' & select entries in rows 0 to \verb'y' \\ -\verb'GrB_ROWGT' & \verb'z=(i>y)' & select entries in rows \verb'y+1' and above \\ -\hline -\verb'GrB_VALUENE_T' & \verb'z=(aij!=y)' & select \verb'A(i,j)' if it is not equal to \verb'y'\\ -\verb'GrB_VALUEEQ_T' & \verb'z=(aij==y)' & select \verb'A(i,j)' is it equal to \verb'y'\\ -\verb'GrB_VALUEGT_T' & \verb'z=(aij>y)' & select \verb'A(i,j)' is it greater than \verb'y' \\ -\verb'GrB_VALUEGE_T' & \verb'z=(aij>=y)' & select \verb'A(i,j)' is it greater than or equal to \verb'y' \\ -\verb'GrB_VALUELT_T' & \verb'z=(aij = accum (w,reduce(A)) +GrB_Info GxB_Vector_fprint // print and check a GrB_Vector ( - GrB_Vector w, // input/output vector for results - const GrB_Vector mask, // optional mask for w, unused if NULL - const GrB_BinaryOp accum, // optional accum for z=accum(w,t) - const GrB_Monoid monoid, // reduce monoid for t=reduce(A) - const GrB_Matrix A, // first input: matrix A - const GrB_Descriptor desc // descriptor for w, mask, and A + GrB_Vector v, // object to print and check + const char *name, // name of the object + GxB_Print_Level pr, // print level + FILE *f // file for output ) ; \end{verbatim} } \end{mdframed} -\verb'GrB_Matrix_reduce_Monoid' -reduces a matrix to a column vector using a monoid, roughly analogous -to \verb"t = sum (A')" in MATLAB, in the default case, where \verb't' is a -column vector. By default, the method reduces across the rows to -obtain a column vector; use \verb'GrB_TRAN' to reduce down the columns. - -The input matrix \verb'A' may be transposed first. Its entries are then -typecast into the type of the \verb'reduce' operator or monoid. The reduction -is applied to all entries in \verb'A (i,:)' to produce the scalar \verb't (i)'. -This is done without the use of the identity value of the monoid. If the -\verb'i'th row \verb'A (i,:)' has no entries, then \verb'(i)' is not an entry -in \verb't' and its value is implicit. If \verb'A (i,:)' has a single entry, -then that is the result \verb't (i)' and \verb'reduce' is not applied at all -for the \verb'i'th row. Otherwise, multiple entries in row \verb'A (i,:)' are -reduced via the \verb'reduce' operator or monoid to obtain a single scalar, -the result \verb't (i)'. - -The final step is ${\bf w \langle m \rangle = w \odot t}$, as described -in Section~\ref{accummask}, except that all the -terms are column vectors instead of matrices. +For example, \verb'GxB_Vector_fprint (v, "my vector", GxB_SHORT, f)' +prints about 30 entries from the vector \verb'v' to the file \verb'f'. 
-\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Vector\_reduce\_$<$type$>$:} reduce a vector to a scalar}
-%-------------------------------------------------------------------------------
-\label{reduce_vector_to_scalar}
+%===============================================================================
+\subsection{{\sf GxB\_Scalar\_fprint:} Print a {\sf GrB\_Scalar}}
+%===============================================================================
\begin{mdframed}[userdefinedwidth=6in]
{\footnotesize
\begin{verbatim}
-GrB_Info GrB_reduce // c = accum (c, reduce_to_scalar (u))
-(
- <type> *c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Vector u, // vector to reduce
- const GrB_Descriptor desc // descriptor (currently unused)
-) ;
-
-GrB_Info GrB_reduce // c = accum (c, reduce_to_scalar (u))
+GrB_Info GxB_Scalar_fprint // print and check a GrB_Scalar
(
- GrB_Scalar c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Vector u, // vector to reduce
- const GrB_Descriptor desc // descriptor (currently unused)
+ GrB_Scalar s, // object to print and check
+ const char *name, // name of the object
+ GxB_Print_Level pr, // print level
+ FILE *f // file for output
) ;
\end{verbatim} } \end{mdframed}
-\verb'GrB_Vector_reduce_<type>'
-reduces a vector to a scalar, analogous to \verb't = sum (u)' in MATLAB,
-except that in GraphBLAS any commutative and associative monoid can be used
-in the reduction.
+For example, \verb'GxB_Scalar_fprint (s, "my scalar", GxB_SHORT, f)'
+prints a short description of the scalar \verb's' to the file \verb'f'.
-The scalar \verb'c' can be a pointer C type: \verb'bool', \verb'int8_t', ...
-\verb'float', \verb'double', or \verb'void *' for a user-defined type,
-or a \verb'GrB_Scalar'.
-If \verb'c' is a \verb'void *' pointer to a user-defined type,
-the type must be identical to the type of the vector \verb'u'.
-This cannot be checked by GraphBLAS and thus results are undefined if the
-types are not the same.
+\newpage
+%===============================================================================
+\subsection{Performance and portability considerations}
+%===============================================================================
+
+Even when the print level is \verb'GxB_SILENT', these methods extensively check
+the contents of the objects passed to them, which can take some time. They
+should be considered debugging tools only, not for final use in production.
+
+The return value of the \verb'GxB_*print' methods can be relied upon, but the
+output to the file (or \verb'stdout') can change from version to version. If
+these methods are eventually added to the GraphBLAS C API Specification, a
+conforming implementation might never print anything at all, regardless of the
+\verb'pr' value. This may be essential if the GraphBLAS library is installed
+on a dedicated device with no file output, for example.
+
+When non-blocking mode is employed, some implementations may wish to print
+nothing at all if the matrix is not yet completed, or to print just an
+indication that the matrix has pending operations and cannot be printed. In
+this case, use \verb'GrB_Matrix_wait', \verb'GrB_Vector_wait', or
+\verb'GxB_Scalar_wait' to finish all pending computations first.
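+For example, the following sketch finishes any pending work on a matrix
+\verb'A' and then prints it. It assumes the two-argument form of
+\verb'GrB_Matrix_wait' from the v2.0 GraphBLAS C API:
+
+{\footnotesize
+\begin{verbatim}
+    // finish all pending computations on A, then print a short summary
+    GrB_Matrix_wait (A, GrB_MATERIALIZE) ;
+    GxB_print (A, GxB_SHORT) ;
+\end{verbatim}}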
If a matrix or vector has pending
+operations, SuiteSparse:GraphBLAS prints a list of the {\em pending tuples},
+which are the entries not yet inserted into the primary data structure. It can
+also print out entries that remain in the data structure but are awaiting
+deletion; these are called {\em zombies} in the output report.
+
+Most of the rest of the report is self-explanatory.
+
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Matrix and Vector iterators} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{iter}
+
+The \verb'GxB_Iterator' is an object that allows user applications to iterate
+over the entries of a matrix or vector, one entry at a time. Iteration can
+be done in a linear manner (analogous to reading a file one entry at a time,
+from start to finish), or in a random-access pattern (analogous to using
+\verb'fseek' to reposition access to a file).
+
+Multiple iterators can be used on a single matrix or vector, even in parallel
+by multiple user threads. While a matrix or vector is being used with an
+iterator, the matrix or vector must not be modified. Doing so will lead to
+undefined results.
-If the vector \verb'u' has no entries, that identity value of the \verb'monoid'
-is copied into the scalar \verb't' (unless \verb'c' is a \verb'GrB_Scalar',
-in which case \verb't' is an empty \verb'GrB_Scalar', with no entry).
-Otherwise, all of the entries in the
-vector are reduced to a single scalar using the \verb'monoid'.
+Since accessing a matrix or vector via an iterator requires many calls to
+the iterator methods, they must be very fast. Error checking is skipped,
+except for the methods that create, attach, or free an iterator. Methods
+that advance an iterator or that access values or indices from a matrix or
+vector do not return error conditions. Instead, they have well-defined
+preconditions that must be met (and which should be checked by the user
+application). If those preconditions are not met, results are undefined.
-The descriptor is unused, but it appears in case it is needed in future
-versions of the GraphBLAS API.
-This function has no mask so its accumulator/mask step differs from the other
-GraphBLAS operations. It does not use the methods described in
-Section~\ref{accummask}, but uses the following method instead.
+The iterator methods are implemented in SuiteSparse:GraphBLAS as both macros
+(via \verb'#define') and as functions of the same name that appear in the
+compiled \verb'libgraphblas.so' library. This requires that the opaque
+contents of the iterator object be defined in \verb'GraphBLAS.h' itself. The
+user application must not access these contents directly, but can only do so
+safely via the iterator methods provided by SuiteSparse:GraphBLAS.
-If \verb'accum' is \verb'NULL', then the scalar \verb't' is typecast into the
-type of \verb'c', and \verb'c = t' is the final result. Otherwise, the scalar
-\verb't' is typecast into the \verb'ytype' of the \verb'accum' operator, and
-the value of \verb'c' (on input) is typecast into the \verb'xtype' of the
-\verb'accum' operator. Next, the scalar \verb'z = accum (c,t)' is computed, of
-the \verb'ztype' of the \verb'accum' operator. Finally, \verb'z' is typecast
-into the final result, \verb'c'.
+The iterator object can be used in one of four sets of methods,
+for four different access patterns:
-If \verb'c' is a non-opaque scalar, no error message can be returned by
-\verb'GrB_error'. If \verb'c' is a \verb'GrB_Scalar', then
-\verb'GrB_error(&err,c)' can be used to return an error string, if an error
-occurs.
+ \begin{enumerate}
+ \item {\em row iterator}: iterates across the rows of a matrix, and then
+ within each row to access the entries in a given row. Accessing all
+ the entries of a matrix using a row iterator requires an outer loop
+ (for the rows) and an inner loop (for the entries in each row).
+ A matrix can be accessed via a row iterator only if its format
+ (determined by \verb'GxB_get (A, GxB_FORMAT, &fmt)') is by-row
+ (that is, \verb'GxB_BY_ROW').
+ See Section~\ref{options}.
+ \item {\em column iterator}: iterates across the columns of a matrix, and
+ then within each column to access the entries in a given column.
+ Accessing all the entries of a matrix using a column iterator requires
+ an outer loop (for the columns) and an inner loop (for the entries in
+ each column). A matrix can be accessed via a column iterator only if
+ its format (determined by \verb'GxB_get (A, GxB_FORMAT, &fmt)') is
+ by-column (that is, \verb'GxB_BY_COL').
+ See Section~\ref{options}.
+ \item {\em entry iterator}: iterates across the entries of a matrix.
+ Accessing all the entries of a matrix using an entry iterator requires
+ just a single loop. Any matrix can be accessed with an entry iterator.
+ \item {\em vector iterator}: iterates across the entries of a vector.
+ Accessing all the entries of a vector using a vector iterator requires
+ just a single loop. Any vector can be accessed with a vector iterator.
+ \end{enumerate}
\newpage
-%-------------------------------------------------------------------------------
-\subsubsection{{\sf GrB\_Matrix\_reduce\_$<$type$>$:} reduce a matrix to a scalar}
-%-------------------------------------------------------------------------------
-\label{reduce_matrix_to_scalar}
+%===============================================================================
+\subsection{Creating and destroying an iterator}
+%===============================================================================
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_reduce // c = accum (c, reduce_to_scalar (A))
-(
- <type> *c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Matrix A, // matrix to reduce
- const GrB_Descriptor desc // descriptor (currently unused)
-) ;
+The process for using an iterator starts with the creation of an iterator, with
+\verb'GxB_Iterator_new'. This method creates an \verb'iterator' object but
+does not {\em attach} it to any specific matrix or vector:
-GrB_Info GrB_reduce // c = accum (c, reduce_to_scalar (A))
-(
- GrB_Scalar c, // result scalar
- const GrB_BinaryOp accum, // optional accum for c=accum(c,t)
- const GrB_Monoid monoid, // monoid to do the reduction
- const GrB_Matrix A, // matrix to reduce
- const GrB_Descriptor desc // descriptor (currently unused)
-) ;
-\end{verbatim} } \end{mdframed}
+ {\footnotesize
+ \begin{verbatim}
+ GxB_Iterator iterator ;
+ GxB_Iterator_new (&iterator) ; \end{verbatim}}
This function is identical to -reducing a vector to a scalar, since the positions of the entries in a matrix -or vector have no effect on the result. Refer to the reduction to scalar -described in the previous Section~\ref{reduce_vector_to_scalar}. +When finished, the \verb'iterator' is freed with either of these methods: + + {\footnotesize + \begin{verbatim} + GrB_free (&iterator) ; + GxB_Iterator_free (&iterator) ; \end{verbatim}} -\newpage %=============================================================================== -\subsection{{\sf GrB\_transpose:} transpose a matrix} %========================= +\subsection{Attaching an iterator to a matrix or vector} %=============================================================================== -\label{transpose} -\begin{mdframed}[userdefinedwidth=6in] -{\footnotesize -\begin{verbatim} -GrB_Info GrB_transpose // C = accum (C, A') -( - GrB_Matrix C, // input/output matrix for results - const GrB_Matrix Mask, // optional mask for C, unused if NULL - const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) - const GrB_Matrix A, // first input: matrix A - const GrB_Descriptor desc // descriptor for C, Mask, and A -) ; -\end{verbatim} } \end{mdframed} +This new \verb'iterator' object can be {\em attached} to any matrix or vector, +and used as a row, column, or entry iterator for any matrix, or as an iterator +for any vector. The \verb'iterator' can be used in any of these methods before +it is freed, but with just one access method at a time. -\verb'GrB_transpose' -transposes a matrix \verb'A', just like the array transpose \verb"T = A.'" in -MATLAB. The internal result matrix \verb"T = A'" (or merely \verb"T = A" if -\verb'A' is transposed via the descriptor) has the same type as \verb'A'. The -final step is ${\bf C \langle M \rangle = C \odot T}$, as described in -Section~\ref{accummask}, which typecasts \verb'T' as needed and applies the -mask and accumulator. +Once it is created, the \verb'iterator' must be attached to a matrix or +vector. This process also selects the method by which the \verb'iterator' +will be used for a matrix. Each of the four \verb'GxB_*Iterator_attach' +methods returns a \verb'GrB_Info' result. The descriptor \verb'desc' in the +examples below is used only to control the number of threads used for the +internal call to \verb'GrB_wait', if the matrix \verb'A' or vector \verb'v' has +pending operations. -To be consistent with the rest of the GraphBLAS API regarding the -descriptor, the input matrix \verb'A' may be transposed first by -setting the \verb'GrB_INP0' setting to \verb'GrB_TRAN'. This results in -a double transpose, and thus \verb'A' is not transposed is computed. + \begin{enumerate} + \item {\em row iterator}: + {\footnotesize + \begin{verbatim} + GrB_Info info = GxB_rowIterator_attach (iterator, A, desc) ; \end{verbatim}} + \item {\em column iterator}: + {\footnotesize + \begin{verbatim} + GrB_Info info = GxB_colIterator_attach (iterator, A, desc) ; \end{verbatim}} + \item {\em entry iterator}: + {\footnotesize + \begin{verbatim} + GrB_Info info = GxB_Matrix_Iterator_attach (iterator, A, desc) ; \end{verbatim}} + \item {\em vector iterator}: + {\footnotesize + \begin{verbatim} + GrB_Info info = GxB_Vector_Iterator_attach (iterator, v, desc) ; \end{verbatim}} + \end{enumerate} + +On input to \verb'GxB_*Iterator_attach', the \verb'iterator' must already +exist, having been created by \verb'GxB_Iterator_new'. 
If the \verb'iterator'
+is already attached to a matrix or vector, it is detached and then attached to
+the given matrix \verb'A' or vector \verb'v'.
+
+The return values for row/column methods are:
+
+ \begin{itemize}
+ \item
+ \verb'GrB_SUCCESS': if the \verb'iterator' is successfully
+ attached to the matrix \verb'A'.
+ \item
+ \verb'GrB_NULL_POINTER': if the \verb'iterator' or \verb'A' is NULL.
+ \item
+ \verb'GrB_INVALID_OBJECT': if the matrix \verb'A' is invalid.
+ \item
+ \verb'GrB_NOT_IMPLEMENTED': if the matrix \verb'A' cannot be iterated
+ in the requested access method (row iterators require the matrix to
+ be held by-row, and column iterators require the matrix to be held
+ by-column).
+ \item
+ \verb'GrB_OUT_OF_MEMORY': if the method runs out of memory.
+ \end{itemize}
+
+The other two methods (entry iterator for matrices, or the vector iterator)
+return the same error codes, except that they
+do not return \verb'GrB_NOT_IMPLEMENTED'.
-\newpage
%===============================================================================
-\subsection{{\sf GrB\_kronecker:} Kronecker product} %==========================
+\subsection{Seeking to an arbitrary position}
%===============================================================================
-\label{kron}
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GrB_kronecker // C = accum (C, kron(A,B))
-(
- GrB_Matrix C, // input/output matrix for results
- const GrB_Matrix Mask, // optional mask for C, unused if NULL
- const GrB_BinaryOp accum, // optional accum for Z=accum(C,T)
- const <operator> op, // defines '*' for T=kron(A,B)
- const GrB_Matrix A, // first input: matrix A
- const GrB_Matrix B, // second input: matrix B
- const GrB_Descriptor desc // descriptor for C, Mask, A, and B
-) ;
-\end{verbatim} } \end{mdframed}
+Attaching the \verb'iterator' to a matrix or vector does not define a specific
+position for the \verb'iterator'. To use the \verb'iterator', a single call to
+the corresponding {\em seek} method is required. These
+\verb'GxB*_Iterator_*seek*' methods may also be used later on to change the
+position of the iterator arbitrarily.
-\verb'GrB_kronecker' computes the Kronecker product,
-${\bf C \langle M \rangle = C \odot \mbox{kron}(A,B)}$ where
-\[
-\mbox{kron}{\bf (A,B)} =
-\left[
- \begin{array}{ccc}
- a_{00} \otimes {\bf B} & \ldots & a_{0,n-1} \otimes {\bf B} \\
- \vdots & \ddots & \vdots \\
- a_{m-1,0} \otimes {\bf B} & \ldots & a_{m-1,n-1} \otimes {\bf B} \\
- \end{array}
-\right]
-\]
-The $\otimes$ operator is defined by the \verb'op' parameter. It is applied in
-an element-wise fashion (like \verb'GrB_eWiseMult'), where the pattern of the
-submatrix $a_{ij} \otimes {\bf B}$ is the same as the pattern of ${\bf B}$ if
-$a_{ij}$ is an entry in the matrix ${\bf A}$, or empty otherwise. The input
-matrices \verb'A' and \verb'B' can be of any dimension, and both matrices may
-be transposed first via the descriptor, \verb'desc'. Entries in \verb'A' and
-\verb'B' are typecast into the input types of the \verb'op'. The matrix
-\verb'T=kron(A,B)' has the same type as the \verb'ztype' of the binary
-operator, \verb'op'. The final step is ${\bf C \langle M \rangle = C \odot
-T}$, as described in Section~\ref{accummask}.
+ \begin{enumerate}
+ \item {\em row iterator}:
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Info info = GxB_rowIterator_seekRow (iterator, row) ;
+ GrB_Index kount = GxB_rowIterator_kount (iterator) ;
+ GrB_Info info = GxB_rowIterator_kseek (iterator, k) ; \end{verbatim}}
+
+ These methods move a row iterator to a specific row, defined in one of
+ two ways: (1) the row index itself (in range 0 to \verb'nrows'-1), or
+ (2) by specifying \verb'k', which moves the iterator to the \verb'k'th
+ {\em explicit} row (in the range 0 to \verb'kount'-1). For sparse,
+ bitmap, or full matrices, these two methods are identical. For
+ hypersparse matrices, not all rows are present in the data structure;
+ these {\em implicit} rows are skipped and not included in the
+ \verb'kount'. Implicit rows contain no entries. The
+ \verb'GxB_rowIterator_kount' method returns the \verb'kount' of the
+ matrix, where \verb'kount' is equal to \verb'nrows' for sparse, bitmap,
+ and full matrices, and \verb'kount' $\le$ \verb'nrows' for hypersparse
+ matrices. All three methods listed above can be used for any row
+ iterator.
-The operator \verb'op' may be a \verb'GrB_BinaryOp', a \verb'GrB_Monoid', or a
-\verb'GrB_Semiring'. In the latter case, the multiplicative operator of
-the semiring is used.
+ The \verb'GxB_rowIterator_*seek*' methods return \verb'GrB_SUCCESS' if
+ the iterator has been moved to a row that contains at least one entry,
+ \verb'GrB_NO_VALUE' if the row has no entries, or \verb'GxB_EXHAUSTED'
+ if the row is out of bounds (\verb'row' $\ge$ \verb'nrows' or
+ \verb'k' $\ge$ \verb'kount').
+ None of these return conditions are
+ errors; they are all informational.
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Printing GraphBLAS objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{fprint}
+ For sparse, bitmap, and full matrices, \verb'GxB_rowIterator_seekRow'
+ always moves to the given row. For hypersparse matrices, if the
+ requested row is implicit, the iterator is moved to the first
+ explicit row following it. If no such row exists, the iterator
+ is exhausted and \verb'GxB_EXHAUSTED' is returned.
+ The \verb'GxB_rowIterator_kseek' method always moves to the \verb'k'th
+ explicit row, for any matrix.
+ Use \verb'GxB_rowIterator_getRowIndex', described below, to determine
+ the row index of the current position.
-The ten different objects handled by SuiteSparse:GraphBLAS are all opaque,
-although nearly all of their contents can be extracted via methods such as
-\verb'GrB_Matrix_extractTuples', \verb'GrB_Matrix_extractElement',
-\verb'GxB_Matrix_type', and so on. The GraphBLAS C API has no mechanism for
-printing all the contents of GraphBLAS objects, but this is helpful for
-debugging. Ten type-specific methods and two type-generic methods are
-provided:
+ Precondition: on input, the \verb'iterator' must have been successfully
+ attached to a matrix via a prior call to \verb'GxB_rowIterator_attach'.
+ Results are undefined if this precondition is not met.
-\vspace{0.2in}
-{\footnotesize
-\begin{tabular}{ll}
-\hline
-\verb'GxB_Type_fprint' & print and check a \verb'GrB_Type' \\
-\verb'GxB_UnaryOp_fprint' & print and check a \verb'GrB_UnaryOp' \\
-\verb'GxB_BinaryOp_fprint' & print and check a \verb'GrB_BinaryOp' \\
-\verb'GxB_IndexUnaryOP_fprint' & print and check a \verb'GrB_IndexUnaryOp' \\
-\verb'GxB_Monoid_fprint' & print and check a \verb'GrB_Monoid' \\
-\verb'GxB_Semiring_fprint' & print and check a \verb'GrB_Semiring' \\
-\verb'GxB_Descriptor_fprint' & print and check a \verb'GrB_Descriptor' \\
-\verb'GxB_Matrix_fprint' & print and check a \verb'GrB_Matrix' \\
-\verb'GxB_Vector_fprint' & print and check a \verb'GrB_Vector' \\
-\verb'GxB_Scalar_fprint' & print and check a \verb'GrB_Scalar' \\
-\hline
-\verb'GxB_fprint' & print/check any object to a file \\
-\verb'GxB_print' & print/check any object to \verb'stdout' \\
-\hline
-\end{tabular}
-}
-\vspace{0.2in}
+ \item {\em column iterator}:
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Info info = GxB_colIterator_seekCol (iterator, col) ;
+ GrB_Index kount = GxB_colIterator_kount (iterator) ;
+ GrB_Info info = GxB_colIterator_kseek (iterator, k) ; \end{verbatim}}
-These methods do not modify the status of any object, and thus they
-cannot return an error string for use by \verb'GrB_error'.
+ These methods move a column iterator to a specific column, defined in
+ one of two ways: (1) the column index itself (in range 0 to
+ \verb'ncols'-1), or (2) by specifying \verb'k', which moves the
+ iterator to the \verb'k'th {\em explicit} column (in the range 0 to
+ \verb'kount'-1). For sparse, bitmap, or full matrices, these two
+ methods are identical. For hypersparse matrices, not all columns are
+ present in the data structure; these {\em implicit} columns are skipped
+ and not included in the \verb'kount'. Implicit columns contain no
+ entries. The \verb'GxB_colIterator_kount' method returns the
+ \verb'kount' of the matrix, where \verb'kount' is equal to \verb'ncols'
+ for sparse, bitmap, and full matrices, and \verb'kount' $\le$ \verb'ncols'
+ for hypersparse matrices. All three methods listed above can be used
+ for any column iterator.
-If a matrix or vector
-has not been completed, the pending computations are guaranteed to {\em not} be
-performed. The reason is simple. It is possible for a bug in the user
-application (such as accessing memory outside the bounds of an array) to mangle
-the internal content of a GraphBLAS object, and the \verb'GxB_*print' methods
-can be helpful tools to track down this bug. If \verb'GxB_*print' attempted to
-complete any computations prior to printing or checking the contents of the
-matrix or vector, then further errors could occur, including a segfault.
+ The \verb'GxB_colIterator_*seek*' methods return \verb'GrB_SUCCESS' if
+ the iterator has been moved to a column that contains at least one
+ entry, \verb'GrB_NO_VALUE' if the column has no entries, or
+ \verb'GxB_EXHAUSTED' if the column is out of bounds (\verb'col' $\ge$
+ \verb'ncols' or \verb'k' $\ge$ \verb'kount').
+ None of these return conditions are
+ errors; they are all informational.
-By contrast, GraphBLAS methods and operations that return values into
-user-provided arrays or variables might finish pending operations before the
-return these values, and this would change their state.
Since they do not
-change the state of any object, the \verb'GxB_*print' methods provide a useful
-alternative for debugging, and for a quick understanding of what GraphBLAS is
-computing while developing a user application.
+ For sparse, bitmap, and full matrices, \verb'GxB_colIterator_seekCol'
+ always moves to the given column. For hypersparse matrices, if the
+ requested column is implicit, the iterator is moved to the first
+ explicit column following it. If no such column exists, the iterator
+ is exhausted and \verb'GxB_EXHAUSTED' is returned.
+ The \verb'GxB_colIterator_kseek' method always moves to the \verb'k'th
+ explicit column, for any matrix.
+ Use \verb'GxB_colIterator_getColIndex', described below, to determine
+ the column index of the current position.
-Each of the methods has a parameter of type \verb'GxB_Print_Level' that
-specifies the amount to print:
+ Precondition: on input, the \verb'iterator' must have been successfully
+ attached to a matrix via a prior call to \verb'GxB_colIterator_attach'.
+ Results are undefined if this precondition is not met.
-{\footnotesize
-\begin{verbatim}
-typedef enum
-{
- GxB_SILENT = 0, // nothing is printed, just check the object
- GxB_SUMMARY = 1, // print a terse summary
- GxB_SHORT = 2, // short description, about 30 entries of a matrix
- GxB_COMPLETE = 3, // print the entire contents of the object
- GxB_SHORT_VERBOSE = 4, // GxB_SHORT but with "%.15g" for doubles
- GxB_COMPLETE_VERBOSE = 5 // GxB_COMPLETE but with "%.15g" for doubles
-}
-GxB_Print_Level ; \end{verbatim}}
+ \item {\em entry iterator}:
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Info info = GxB_Matrix_Iterator_seek (iterator, p) ;
+ GrB_Index pmax = GxB_Matrix_Iterator_getpmax (iterator) ;
+ GrB_Index p = GxB_Matrix_Iterator_getp (iterator) ; \end{verbatim}}
-The ten type-specific functions include an additional argument, the
-\verb'name' string. The \verb'name' is printed at the beginning of the display
-(assuming the print level is not \verb'GxB_SILENT') so that the object can be
-more easily identified in the output. For the type-generic methods
-\verb'GxB_fprint' and \verb'GxB_print', the \verb'name' string is the variable
-name of the object itself.
+ The \verb'GxB_Matrix_Iterator_seek' method moves the \verb'iterator' to
+ the given position \verb'p', which is in the range 0 to \verb'pmax'-1,
+ where the value of \verb'pmax' is obtained from
+ \verb'GxB_Matrix_Iterator_getpmax'.
+ For sparse, hypersparse, and full matrices, \verb'pmax' is the same as
+ \verb'nvals' returned by \verb'GrB_Matrix_nvals'. For bitmap matrices,
+ \verb'pmax' is equal to \verb'nrows*ncols'. If \verb'p' $\ge$
+ \verb'pmax', the iterator is exhausted and \verb'GxB_EXHAUSTED' is
+ returned. Otherwise, \verb'GrB_SUCCESS' is returned.
-If the file \verb'f' is \verb'NULL', \verb'stdout' is used.
-If \verb'name' is \verb'NULL', it is treated
-as the empty string. These are not error conditions.
+ All entries in the matrix are given an ordinal position, \verb'p'.
+ Seeking to position \verb'p' will either move the \verb'iterator' to
+ that particular position, or to the next higher position containing an
+ entry if there is no entry at position \verb'p'. The latter case only
+ occurs for bitmap matrices.
+ Use \verb'GxB_Matrix_Iterator_getp' to determine the current
+ position of the iterator.
-The methods check their input objects carefully and extensively, even when
-\verb'pr' is equal to \verb'GxB_SILENT'.
-The following error codes can be
-returned:
+
+    Precondition: on input, the \verb'iterator' must have been successfully
+    attached to a matrix via a prior call to
+    \verb'GxB_Matrix_Iterator_attach'.  Results are undefined if this
+    precondition is not met.
-\begin{packed_itemize}
-\item \verb'GrB_SUCCESS': object is valid
-\item \verb'GrB_UNINITIALIZED_OBJECT': object is not initialized
-\item \verb'GrB_INVALID_OBJECT': object is not valid
-\item \verb'GrB_NULL_POINTER': object is a NULL pointer
-\item \verb'GrB_INVALID_VALUE': \verb'fprintf' returned an I/O error.
-\end{packed_itemize}
+
+    \item {\em vector iterator}:
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_Vector_Iterator_seek (iterator, p) ;
+    GrB_Index pmax = GxB_Vector_Iterator_getpmax (iterator) ;
+    GrB_Index p = GxB_Vector_Iterator_getp (iterator) ; \end{verbatim}}
-The content of any GraphBLAS object is opaque, and subject to change.  As a
-result, the exact content and format of what is printed is
-implementation-dependent, and will change from version to version of
-SuiteSparse:GraphBLAS.  Do not attempt to rely on the exact content or format
-by trying to parse the resulting output via another program.  The intent of
-these functions is to produce a report of an object for visual inspection.  If
-the user application needs to extract content from a GraphBLAS matrix or
-vector, use \verb'GrB_*_extractTuples' or the import/export methods instead.
+
+    The \verb'GxB_Vector_Iterator_seek' method is identical to
+    \verb'GxB_Matrix_Iterator_seek', but applied to a \verb'GrB_Vector'
+    instead.
-GraphBLAS matrices and vectors are zero-based, where indices of an $n$-by-$n$
-matrix are in the range 0 to $n-1$.  However, Octave, MATLAB, and Julia prefer
-to print their matrices and vectors as one-based.  To enable 1-based printing,
-use \verb'GxB_set (GxB_PRINT_1BASED, true)'.  Printing is done as zero-based by
-default.
+
+    Precondition: on input, the \verb'iterator' must have been successfully
+    attached to a vector via a prior call to
+    \verb'GxB_Vector_Iterator_attach'.  Results are undefined if this
+    precondition is not met.
+
+    \end{enumerate}
-\newpage
 %===============================================================================
-\subsection{{\sf GxB\_fprint:} Print a GraphBLAS object to a file} %============
+\subsection{Advancing to the next position}
 %===============================================================================
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_fprint                 // print and check a GraphBLAS object
-(
-    GrB_<objecttype> object,        // object to print and check
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+For best performance, the {\em seek} methods described above should be used
+with care, since some of them require $O(\log n)$ time.  The fastest method
+for changing the position of the iterator is the corresponding {\em next}
+method, described below for each iterator:
-The \verb'GxB_fprint' function prints the contents of any of the ten GraphBLAS
-objects to the file \verb'f'.  If \verb'f' is \verb'NULL', the results are
-printed to \verb'stdout'.  For example, to print the entire contents of a
-matrix \verb'A' to the file \verb'f', use
-\verb'GxB_fprint (A, GxB_COMPLETE, f)'.
+
+    \begin{enumerate}
+    \item {\em row iterator}: To move to the next row.
-%===============================================================================
-\subsection{{\sf GxB\_print:} Print a GraphBLAS object to {\sf stdout}} %=======
-%===============================================================================
-\label{gxb_print}
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_rowIterator_nextRow (iterator) ; \end{verbatim}}
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_print                  // print and check a GrB_Vector
-(
-    GrB_<objecttype> object,        // object to print and check
-    GxB_Print_Level pr              // print level
-) ;
-\end{verbatim} } \end{mdframed}
+
+    The row iterator is a 2-dimensional iterator, requiring an outer loop and
+    an inner loop.  The outer loop iterates over the rows of the matrix, using
+    \verb'GxB_rowIterator_nextRow' to move to the next row.  If the matrix is
+    hypersparse, the next row is always an explicit row; implicit rows are
+    skipped.  The return conditions are identical to
+    \verb'GxB_rowIterator_seekRow'.
+
+    Preconditions: on input, the row iterator must already be attached to a
+    matrix via a prior call to \verb'GxB_rowIterator_attach', and the
+    \verb'iterator' must be at a specific row, via a prior call to
+    \verb'GxB_rowIterator_*seek*' or \verb'GxB_rowIterator_nextRow'.
+    Results are undefined if these conditions are not met.
+
+    \item {\em row iterator}: To move to the next entry within a row.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_rowIterator_nextCol (iterator) ; \end{verbatim}}
+
+    The row iterator is moved to the next entry in the current row.
+    The method returns \verb'GrB_NO_VALUE' if the end of the row is reached.
+    The iterator does not move to the next row in this case.
+    The method returns \verb'GrB_SUCCESS' if the iterator has been moved
+    to a specific entry in the current row.
-\verb'GxB_print' is the same as \verb'GxB_fprint', except that it prints the
-contents of the object to \verb'stdout' instead of a file \verb'f'.  For
-example, to print the entire contents of a matrix \verb'A', use
-\verb'GxB_print (A, GxB_COMPLETE)'.
+
+    Preconditions: the same as \verb'GxB_rowIterator_nextRow'.
-%===============================================================================
-\subsection{{\sf GxB\_Type\_fprint:} Print a {\sf GrB\_Type}}
-%===============================================================================
+
+    \item {\em column iterator}: To move to the next column.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Type_fprint            // print and check a GrB_Type
-(
-    GrB_Type type,                  // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_colIterator_nextCol (iterator) ; \end{verbatim}}
-For example, \verb'GxB_Type_fprint (GrB_BOOL, "boolean type", GxB_COMPLETE, f)'
-prints the contents of the \verb'GrB_BOOL' object to the file \verb'f'.
+
+    The column iterator is a 2-dimensional iterator, requiring an outer loop
+    and an inner loop.  The outer loop iterates over the columns of the matrix,
+    using \verb'GxB_colIterator_nextCol' to move to the next column.  If the
+    matrix is hypersparse, the next column is always an explicit column;
+    implicit columns are skipped.  The return conditions are identical to
+    \verb'GxB_colIterator_seekCol'.
-\newpage
-%===============================================================================
-\subsection{{\sf GxB\_UnaryOp\_fprint:} Print a {\sf GrB\_UnaryOp}}
-%===============================================================================
+
+    Preconditions: on input, the column iterator must already be attached to a
+    matrix via a prior call to \verb'GxB_colIterator_attach', and the
+    \verb'iterator' must be at a specific column, via a prior call to
+    \verb'GxB_colIterator_*seek*' or \verb'GxB_colIterator_nextCol'.
+    Results are undefined if these conditions are not met.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_UnaryOp_fprint         // print and check a GrB_UnaryOp
-(
-    GrB_UnaryOp unaryop,            // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    \item {\em column iterator}: To move to the next entry within a column.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_colIterator_nextRow (iterator) ; \end{verbatim}}
-For example,
-\verb'GxB_UnaryOp_fprint (GrB_LNOT, "not", GxB_COMPLETE, f)'
-prints the \verb'GrB_LNOT' unary operator to the file \verb'f'.
+
+    The column iterator is moved to the next entry in the current column.
+    The method returns \verb'GrB_NO_VALUE' if the end of the column is reached.
+    The iterator does not move to the next column in this case.
+    The method returns \verb'GrB_SUCCESS' if the iterator has been moved
+    to a specific entry in the current column.
+
+    Preconditions: the same as \verb'GxB_colIterator_nextCol'.
-%===============================================================================
-\subsection{{\sf GxB\_BinaryOp\_fprint:} Print a {\sf GrB\_BinaryOp}}
-%===============================================================================
+
+    \item {\em entry iterator}: To move to the next entry.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_Matrix_Iterator_next (iterator) ; \end{verbatim}}
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_BinaryOp_fprint        // print and check a GrB_BinaryOp
-(
-    GrB_BinaryOp binaryop,          // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    This method moves an iterator to the next entry of a matrix.
+    It returns \verb'GrB_SUCCESS' if the iterator is at an entry that
+    exists in the matrix, or \verb'GxB_EXHAUSTED' otherwise.
-For example,
-\verb'GxB_BinaryOp_fprint (GrB_PLUS_FP64, "plus", GxB_COMPLETE, f)' prints the
-\verb'GrB_PLUS_FP64' binary operator to the file \verb'f'.
+
+    Preconditions: on input, the entry iterator must be already attached to a
+    matrix via \verb'GxB_Matrix_Iterator_attach', and the position of the
+    iterator must also have been defined by a prior call to
+    \verb'GxB_Matrix_Iterator_seek' or \verb'GxB_Matrix_Iterator_next'.
+    Results are undefined if these conditions are not met.
+
+    \item {\em vector iterator}: To move to the next entry.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Info info = GxB_Vector_Iterator_next (iterator) ; \end{verbatim}}
-%===============================================================================
-\subsection{{\sf GxB\_IndexUnaryOp\_fprint:} Print a {\sf GrB\_IndexUnaryOp}}
-%===============================================================================
+
+    This method moves an iterator to the next entry of a vector.
+    It returns \verb'GrB_SUCCESS' if the iterator is at an entry that
+    exists in the vector, or \verb'GxB_EXHAUSTED' otherwise.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_IndexUnaryOp_fprint    // print and check a GrB_IndexUnaryOp
-(
-    GrB_IndexUnaryOp op,            // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    Preconditions: on input, the iterator must be already attached to a
+    vector via \verb'GxB_Vector_Iterator_attach', and the position of the
+    iterator must also have been defined by a prior call to
+    \verb'GxB_Vector_Iterator_seek' or \verb'GxB_Vector_Iterator_next'.
+    Results are undefined if these conditions are not met.
-For example,
-\verb'GrB_IndexUnaryOp_fprint (GrB_TRIL, "tril", GxB_COMPLETE, f)' prints
-the \verb'GrB_TRIL' index-unary operator to the file \verb'f'.
+
+    \end{enumerate}
-\newpage
 %===============================================================================
-\subsection{{\sf GxB\_Monoid\_fprint:} Print a {\sf GrB\_Monoid}}
+\subsection{Accessing the indices of the current entry}
 %===============================================================================
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Monoid_fprint          // print and check a GrB_Monoid
-(
-    GrB_Monoid monoid,              // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+Once the iterator is attached to a matrix or vector, and is placed in position
+at an entry in the matrix or vector, the indices and value of this entry can be
+obtained.  The methods for accessing the value of the entry are described in
+Section~\ref{getvalu}.  Accessing the indices is performed with four different
+sets of methods, depending on which access pattern is in use, described below:
-For example,
-\verb'GxB_Monoid_fprint (GxB_PLUS_FP64_MONOID, "plus monoid",'
-\verb'GxB_COMPLETE, f)'
-prints the predefined \verb'GxB_PLUS_FP64_MONOID' (based on the binary
-operator \verb'GrB_PLUS_FP64') to the file \verb'f'.
+
+    \begin{enumerate}
+    \item {\em row iterator}: To get the current row index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index i = GxB_rowIterator_getRowIndex (iterator) ; \end{verbatim}}
-%===============================================================================
-\subsection{{\sf GxB\_Semiring\_fprint:} Print a {\sf GrB\_Semiring}}
-%===============================================================================
+
+    The method returns \verb'nrows(A)' if the iterator is exhausted, or the
+    current row index \verb'i' otherwise.  There need not be any entry in the
+    current row.  Zero is returned if the iterator is attached to the matrix
+    but \verb'GxB_rowIterator_*seek*' has not been called, but this does not
+    mean the iterator is positioned at row zero.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Semiring_fprint        // print and check a GrB_Semiring
-(
-    GrB_Semiring semiring,          // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    Preconditions: on input, the iterator must be already successfully attached
+    to a matrix as a row iterator via \verb'GxB_rowIterator_attach'.
+    Results are undefined if this condition is not met.
-For example,
-\verb'GxB_Semiring_fprint (GxB_PLUS_TIMES_FP64, "standard",'
-\verb'GxB_COMPLETE, f)'
-prints the predefined \verb'GxB_PLUS_TIMES_FP64' semiring to the file \verb'f'.
+
+    \item {\em row iterator}: To get the current column index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index j = GxB_rowIterator_getColIndex (iterator) ; \end{verbatim}}
+
+    The method returns the column index \verb'j' of the current entry.
-%===============================================================================
-\subsection{{\sf GxB\_Descriptor\_fprint:} Print a {\sf GrB\_Descriptor}}
-%===============================================================================
+
+    Preconditions: on input, the iterator must be already successfully attached
+    to a matrix as a row iterator via \verb'GxB_rowIterator_attach', and in
+    addition, the row iterator must be positioned at a valid entry present in
+    the matrix.  That is, the last call to \verb'GxB_rowIterator_*seek*' or
+    \verb'GxB_rowIterator_*next*' must have returned \verb'GrB_SUCCESS'.
+    Results are undefined if these conditions are not met.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Descriptor_fprint      // print and check a GrB_Descriptor
-(
-    GrB_Descriptor descriptor,      // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    \item {\em column iterator}: To get the current column index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index j = GxB_colIterator_getColIndex (iterator) ; \end{verbatim}}
-For example,
-\verb'GxB_Descriptor_fprint (d, "descriptor", GxB_COMPLETE, f)'
-prints the descriptor \verb'd' to the file \verb'f'.
+
+    The method returns \verb'ncols(A)' if the iterator is exhausted, or the
+    current column index \verb'j' otherwise.  There need not be any entry in the
+    current column.  Zero is returned if the iterator is attached to the matrix
+    but \verb'GxB_colIterator_*seek*' has not been called, but this does not
+    mean the iterator is positioned at column zero.
-\newpage
-%===============================================================================
-\subsection{{\sf GxB\_Matrix\_fprint:} Print a {\sf GrB\_Matrix}}
-%===============================================================================
+
+    Precondition: on input, the iterator must be already successfully attached
+    to a matrix as a column iterator via \verb'GxB_colIterator_attach'.
+    Results are undefined if this condition is not met.
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Matrix_fprint          // print and check a GrB_Matrix
-(
-    GrB_Matrix A,                   // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    \item {\em column iterator}: To get the current row index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index i = GxB_colIterator_getRowIndex (iterator) ; \end{verbatim}}
+
+    The method returns the row index \verb'i' of the current entry.
-For example, \verb'GxB_Matrix_fprint (A, "my matrix", GxB_SHORT, f)'
-prints about 30 entries from the matrix \verb'A' to the file \verb'f'.
+
+    Preconditions: on input, the iterator must be already successfully attached
+    to a matrix as a column iterator via \verb'GxB_colIterator_attach', and in
+    addition, the column iterator must be positioned at a valid entry present in
+    the matrix.  That is, the last call to \verb'GxB_colIterator_*seek*' or
+    \verb'GxB_colIterator_*next*' must have returned \verb'GrB_SUCCESS'.
+    Results are undefined if these conditions are not met.
+
+    \item {\em entry iterator}: To get the current row and column index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index i, j ;
+    GxB_Matrix_Iterator_getIndex (iterator, &i, &j) ; \end{verbatim}}
+
+    Returns the row and column index of the current entry.
+
+    Preconditions: on input, the entry iterator must be already attached to a
+    matrix via \verb'GxB_Matrix_Iterator_attach', and the position of the
+    iterator must also have been defined by a prior call to
+    \verb'GxB_Matrix_Iterator_seek' or \verb'GxB_Matrix_Iterator_next', with a
+    return value of \verb'GrB_SUCCESS'.
+    Results are undefined if these conditions are not met.
-%===============================================================================
-\subsection{{\sf GxB\_Vector\_fprint:} Print a {\sf GrB\_Vector}}
-%===============================================================================
+
+    \item {\em vector iterator}: To get the current index.
+
+    {\footnotesize
+    \begin{verbatim}
+    GrB_Index i = GxB_Vector_Iterator_getIndex (iterator) ; \end{verbatim}}
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Vector_fprint          // print and check a GrB_Vector
-(
-    GrB_Vector v,                   // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+    Returns the index of the current entry.
-For example, \verb'GxB_Vector_fprint (v, "my vector", GxB_SHORT, f)'
-prints about 30 entries from the vector \verb'v' to the file \verb'f'.
+
+    Preconditions: on input, the vector iterator must be already attached to a
+    vector via \verb'GxB_Vector_Iterator_attach', and the position of the
+    iterator must also have been defined by a prior call to
+    \verb'GxB_Vector_Iterator_seek' or \verb'GxB_Vector_Iterator_next', with a
+    return value of \verb'GrB_SUCCESS'.
+    Results are undefined if these conditions are not met.
+
+    \end{enumerate}
 %===============================================================================
-\subsection{{\sf GxB\_Scalar\_fprint:} Print a {\sf GrB\_Scalar}}
+\subsection{Accessing the value of the current entry}
+\label{getvalu}
 %===============================================================================
-\begin{mdframed}[userdefinedwidth=6in]
-{\footnotesize
-\begin{verbatim}
-GrB_Info GxB_Scalar_fprint          // print and check a GrB_Scalar
-(
-    GrB_Scalar s,                   // object to print and check
-    const char *name,               // name of the object
-    GxB_Print_Level pr,             // print level
-    FILE *f                         // file for output
-) ;
-\end{verbatim} } \end{mdframed}
+
+So far, all methods that create or use an iterator have been split into four
+sets of methods, for the row, column, or entry iterators attached to a matrix,
+or for a vector iterator.  Accessing the value is different.  All four
+iterators use the same set of methods to access the value of their current
+entry.  These methods return the value of the current entry at the position
+determined by the iterator.  The return value can of course be typecasted
+using standard C syntax once the value is returned to the caller.
-For example, \verb'GxB_Scalar_fprint (s, "my scalar", GxB_SHORT, f)'
-prints a short description of the scalar \verb's' to the file \verb'f'.
+
+Preconditions: on input, the prior call to \verb'GxB_*Iterator_*seek*' or
+\verb'GxB_*Iterator_*next*' must have returned \verb'GrB_SUCCESS', indicating
+that the iterator is at a valid current entry for either a matrix or vector.
+No typecasting is permitted, in the sense that the method name must match the
+type of the matrix or vector.
+Results are undefined if these conditions are not met.
-\newpage
-%===============================================================================
-\subsection{Performance and portability considerations}
-%===============================================================================
+
+    {\footnotesize
+    \begin{verbatim}
+    // for built-in types:
+    bool       value = GxB_Iterator_get_BOOL (iterator) ;
+    int8_t     value = GxB_Iterator_get_INT8 (iterator) ;
+    int16_t    value = GxB_Iterator_get_INT16 (iterator) ;
+    int32_t    value = GxB_Iterator_get_INT32 (iterator) ;
+    int64_t    value = GxB_Iterator_get_INT64 (iterator) ;
+    uint8_t    value = GxB_Iterator_get_UINT8 (iterator) ;
+    uint16_t   value = GxB_Iterator_get_UINT16 (iterator) ;
+    uint32_t   value = GxB_Iterator_get_UINT32 (iterator) ;
+    uint64_t   value = GxB_Iterator_get_UINT64 (iterator) ;
+    float      value = GxB_Iterator_get_FP32 (iterator) ;
+    double     value = GxB_Iterator_get_FP64 (iterator) ;
+    GxB_FC32_t value = GxB_Iterator_get_FC32 (iterator) ;
+    GxB_FC64_t value = GxB_Iterator_get_FC64 (iterator) ;
-Even when the print level is \verb'GxB_SILENT', these methods extensively check
-the contents of the objects passed to them, which can take some time.  They
-should be considered debugging tools only, not for final use in production.
+
+    // for user-defined types:
+    <type> value ;
+    GxB_Iterator_get_UDT (iterator, (void *) &value) ; \end{verbatim}}
-The return value of the \verb'GxB_*print' methods can be relied upon, but the
-output to the file (or \verb'stdout') can change from version to version.  If
-these methods are eventually added to the GraphBLAS C API Specification, a
-conforming implementation might never print anything at all, regardless of the
-\verb'pr' value.  This may be essential if the GraphBLAS library is installed
-in a dedicated device, with no file output, for example.
+
+%===============================================================================
+\newpage
+\subsection{Example: row iterator for a matrix}
+%===============================================================================
-Some implementations may wish to print nothing at all if the matrix is not yet
-completed, or just an indication that the matrix has pending operations and
-cannot be printed, when non-blocking mode is employed.  In this case, use
-\verb'GrB_Matrix_wait', \verb'GrB_Vector_wait', or \verb'GxB_Scalar_wait' to
-finish all pending computations first.  If a matrix or vector has pending
-operations, SuiteSparse:GraphBLAS prints a list of the {\em pending tuples},
-which are the entries not yet inserted into the primary data structure.  It can
-also print out entries that remain in the data structure but are awaiting
-deletion; these are called {\em zombies} in the output report.
+
+The following example uses a row iterator to access all of the entries
+in a matrix \verb'A' of type \verb'GrB_FP64'.  Note the inner and outer loops.
+The outer loop iterates over all rows of the matrix.  The inner loop iterates
+over all entries in the row \verb'i'.  This access pattern requires the matrix
+to be held by-row, but otherwise it works for any matrix.  If the matrix is
+held by-column, then use the column iterator methods instead.
-Most of the rest of the report is self-explanatory.
+
+    {\footnotesize
+    \begin{verbatim}
+    // create an iterator
+    GxB_Iterator iterator ;
+    GxB_Iterator_new (&iterator) ;
+    // attach it to the matrix A, known to be type GrB_FP64
+    GrB_Info info = GxB_rowIterator_attach (iterator, A, NULL) ;
+    if (info < 0) { handle the failure ... }
+    // seek to A(0,:)
+    info = GxB_rowIterator_seekRow (iterator, 0) ;
+    while (info != GxB_EXHAUSTED)
+    {
+        // iterate over entries in A(i,:)
+        GrB_Index i = GxB_rowIterator_getRowIndex (iterator) ;
+        while (info == GrB_SUCCESS)
+        {
+            // get the entry A(i,j)
+            GrB_Index j = GxB_rowIterator_getColIndex (iterator) ;
+            double aij = GxB_Iterator_get_FP64 (iterator) ;
+            // move to the next entry in A(i,:)
+            info = GxB_rowIterator_nextCol (iterator) ;
+        }
+        // move to the next row, A(i+1,:), or a subsequent one if i+1 is implicit
+        info = GxB_rowIterator_nextRow (iterator) ;
+    }
+    GrB_free (&iterator) ; \end{verbatim}}
+
+%===============================================================================
 \newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Matrix and Vector iterators} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{iter}
+\subsection{Example: column iterator for a matrix}
+%===============================================================================
-The \verb'GxB_Iterator' is an object that allows user applications to iterate
-over the entries of a matrix or vector, one entry at a time.  Iteration can
-be done in a linear manner (analogous to reading a file one entry at a time,
-from start to finish), or in a random-access pattern (analogous to
-the \verb'fseek' method for repositioning the access to file to a different
-position).
+
+The column iterator is analogous to the row iterator.
-Multiple iterators can be used on a single matrix or vector, even in parallel
-by multiple user threads.  While a matrix or vector is being used with an
-iterator, the matrix or vector must not be modified.  Doing so will lead to
-undefined results.
+
+The following example uses a column iterator to access all of the entries in a
+matrix \verb'A' of type \verb'GrB_FP64'.  The outer loop iterates over all
+columns of the matrix.  The inner loop iterates over all entries in the column
+\verb'j'.  This access pattern requires the matrix to be held by-column, but
+otherwise it works for any matrix.  If the matrix is held by-row, then use
+the row iterator methods instead.
-Since accessing a matrix or vector via an iterator requires many calls to
-the iterator methods, they must be very fast.  Error checking is skipped,
-except for the methods that create, attach, or free an iterator.  Methods
-that advance an iterator or that access values or indices from a matrix or
-vector do not return error conditions.  Instead, they have well-defined
-preconditions that must be met (and which should be checked by the user
-application).  If those preconditions are not met, results are undefined.
+
+    {\footnotesize
+    \begin{verbatim}
+    // create an iterator
+    GxB_Iterator iterator ;
+    GxB_Iterator_new (&iterator) ;
+    // attach it to the matrix A, known to be type GrB_FP64
+    GrB_Info info = GxB_colIterator_attach (iterator, A, NULL) ;
+    if (info < 0) { handle the failure ... }
+    // seek to A(:,0)
+    info = GxB_colIterator_seekCol (iterator, 0) ;
+    while (info != GxB_EXHAUSTED)
+    {
+        // iterate over entries in A(:,j)
+        GrB_Index j = GxB_colIterator_getColIndex (iterator) ;
+        while (info == GrB_SUCCESS)
+        {
+            // get the entry A(i,j)
+            GrB_Index i = GxB_colIterator_getRowIndex (iterator) ;
+            double aij = GxB_Iterator_get_FP64 (iterator) ;
+            // move to the next entry in A(:,j)
+            info = GxB_colIterator_nextRow (iterator) ;
+        }
+        // move to the next column, A(:,j+1), or a subsequent one if j+1 is implicit
+        info = GxB_colIterator_nextCol (iterator) ;
+    }
+    GrB_free (&iterator) ; \end{verbatim}}
-The iterator methods are implemented in SuiteSparse:GraphBLAS as both macros
-(via \verb'#define') and as functions of the same name that appear in the
-compiled \verb'libgraphblas.so' library.  This requires that the opaque
-contents of the iterator object be defined in \verb'GraphBLAS.h' itself.  The
-user application must not access these contents directly, but can only do so
-safely via the iterator methods provided by SuiteSparse:GraphBLAS.
+
+%===============================================================================
+\newpage
+\subsection{Example: entry iterator for a matrix}
+%===============================================================================
-The iterator object can be used in one of four sets of methods,
-for four different access patterns:
+
+The entry iterator allows for a simpler access pattern, with a single loop, but
+using a row or column iterator is faster.  The method works for any matrix.
-    \begin{enumerate}
-    \item {\em row iterator}: iterates across the rows of a matrix, and then
-    within each row to access the entries in a given row.  Accessing all
-    the entries of a matrix using a row iterator requires an outer loop
-    (for the rows) and an inner loop (for the entries in each row).
-    A matrix can be accessed via a row iterator only if its format
-    (determined by \verb'GxB_get (A, GxB_FORMAT, &fmt)') is by-row
-    (that is, \verb'GxB_BY_ROW').
-    See Section~\ref{options}.
-    \item {\em column iterator}: iterates across the columns of a matrix, and
-    then within each column to access the entries in a given column.
-    Accessing all the entries of a matrix using a column iterator requires
-    an outer loop (for the columns) and an inner loop (for the entries in
-    each column).  A matrix can be accessed via a column iterator only if
-    its format (determined by \verb'GxB_get (A, GxB_FORMAT, &fmt)') is
-    by-column (that is, \verb'GxB_BY_COL').
-    See Section~\ref{options}.
-    \item {\em entry iterator}: iterates across the entries of a matrix.
-    Accessing all the entries of a matrix using an entry iterator requires
-    just a single loop.  Any matrix can be accessed with an entry iterator.
-    \item {\em vector iterator}: iterates across the entries of a vector.
-    Accessing all the entries of a vector using a vector iterator requires
-    just a single loop.  Any vector can be accessed with a vector iterator.
-    \end{enumerate}
+
+    {\footnotesize
+    \begin{verbatim}
+    // create an iterator
+    GxB_Iterator iterator ;
+    GxB_Iterator_new (&iterator) ;
+    // attach it to the matrix A, known to be type GrB_FP64
+    GrB_Info info = GxB_Matrix_Iterator_attach (iterator, A, NULL) ;
+    if (info < 0) { handle the failure ...
} + // seek to the first entry + info = GxB_Matrix_Iterator_seek (iterator, 0) ; + while (info != GxB_EXHAUSTED) + { + // get the entry A(i,j) + GrB_Index i, j ; + GxB_Matrix_Iterator_getIndex (iterator, &i, &j) ; + double aij = GxB_Iterator_get_FP64 (iterator) ; + // move to the next entry in A + info = GxB_Matrix_Iterator_next (iterator) ; + } + GrB_free (&iterator) ; \end{verbatim}} -\newpage %=============================================================================== -\subsection{Creating and destroying an iterator} +\subsection{Example: vector iterator} %=============================================================================== -The process for using an iterator starts with the creation of an iterator, with -\verb'GxB_Iterator_new'. This method creates an \verb'iterator' object but -does not {\em attach} it to any specific matrix or vector: +A vector iterator is used much like an entry iterator for a matrix. {\footnotesize \begin{verbatim} + // create an iterator GxB_Iterator iterator ; - GxB_Iterator_new (&iterator) ; \end{verbatim}} - -When finished, the \verb'iterator' is freed with either of these methods: - - {\footnotesize - \begin{verbatim} - GrB_free (&iterator) ; - GxB_Iterator_free (&iterator) ; \end{verbatim}} + GxB_Iterator_new (&iterator) ; + // attach it to the vector v, known to be type GrB_FP64 + GrB_Info info = GxB_Vector_Iterator_attach (iterator, v, NULL) ; + if (info < 0) { handle the failure ... } + // seek to the first entry + info = GxB_Vector_Iterator_seek (iterator, 0) ; + while (info != GxB_EXHAUSTED) + { + // get the entry v(i) + GrB_Index i = GxB_Vector_Iterator_getIndex (iterator) ; + double vi = GxB_Iterator_get_FP64 (iterator) ; + // move to the next entry in v + info = GxB_Vector_Iterator_next (iterator) ; + } + GrB_free (&iterator) ; \end{verbatim}} %=============================================================================== -\subsection{Attaching an iterator to a matrix or vector} +\newpage +\subsection{Performance} %=============================================================================== -This new \verb'iterator' object can be {\em attached} to any matrix or vector, -and used as a row, column, or entry iterator for any matrix, or as an iterator -for any vector. The \verb'iterator' can be used in any of these methods before -it is freed, but with just one access method at a time. - -Once it is created, the \verb'iterator' must be attached to a matrix or -vector. This process also selects the method by which the \verb'iterator' -will be used for a matrix. Each of the four \verb'GxB_*Iterator_attach' -methods returns a \verb'GrB_Info' result. The descriptor \verb'desc' in the -examples below is used only to control the number of threads used for the -internal call to \verb'GrB_wait', if the matrix \verb'A' or vector \verb'v' has -pending operations. 
-
-    \begin{enumerate}
-    \item {\em row iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_rowIterator_attach (iterator, A, desc) ; \end{verbatim}}
-    \item {\em column iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_colIterator_attach (iterator, A, desc) ; \end{verbatim}}
-    \item {\em entry iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_Matrix_Iterator_attach (iterator, A, desc) ; \end{verbatim}}
-    \item {\em vector iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_Vector_Iterator_attach (iterator, v, desc) ; \end{verbatim}}
-    \end{enumerate}
-
-On input to \verb'GxB_*Iterator_attach', the \verb'iterator' must already
-exist, having been created by \verb'GxB_Iterator_new'.  If the \verb'iterator'
-is already attached to a matrix or vector, it is detached and then attached to
-the given matrix \verb'A' or vector \verb'v'.
-
-The return values for row/column methods are:
-
-    \begin{itemize}
-    \item
-    \verb'GrB_SUCCESS': if the \verb'iterator' is successfully
-    attached to the matrix \verb'A'.
-    \item
-    \verb'GrB_NULL_POINTER': if the \verb'iterator' or \verb'A' are NULL.
-    \item
-    \verb'GrB_INVALID_OBJECT': if the matrix \verb'A' is invalid.
-    \item
-    \verb'GrB_NOT_IMPLEMENTED': if the matrix \verb'A' cannot be iterated
-    in the requested access method (row iterators require the matrix to
-    be held by-row, and column iterators require the matrix to be held
-    by-column).
-    \item
-    \verb'GrB_OUT_OF_MEMORY': if the method runs out of memory.
-    \end{itemize}
+
+I have benchmarked the performance of the row and column iterators to compute
+\verb'y=0' and then \verb'y+=A*x' where \verb'y' is a dense vector and \verb'A'
+is a sparse matrix, using a single thread.  The row and column iterators are
+very fast, sometimes only 1\% slower than calling \verb'GrB_mxv' to compute the
+same thing (also assuming a single thread), for large problems.  For sparse
+matrices that average just 1 or 2 entries per row, the row iterator can be
+about 30\% slower than \verb'GrB_mxv', likely because of the slightly higher
+complexity of moving from one row to the next using these methods.
-
-The other two methods (entry iterator for matrices, or the vector iterator)
-return the same error codes, except that they
-do not return \verb'GrB_NOT_IMPLEMENTED'.
+
+It is possible to split up the problem for multiple user threads, each with its
+own iterator.  Given the low overhead of the row and column iterator for a
+single thread, this should be very fast.  Care must be taken to ensure a good
+load balance.  Simply splitting up the rows of a matrix and giving the same
+number of rows to each user thread can result in imbalanced work.  This is
+handled internally in \verb'GrB_*' methods, but enabling parallelism when using
+iterators is the responsibility of the user application.
-
-%===============================================================================
-\subsection{Seeking to an arbitrary position}
-%===============================================================================
+
+The entry iterators are easier to use but harder to implement.  The methods
+must internally fuse both inner and outer loops so that the user application can
+use a single loop.  As a result, the computation \verb'y+=A*x' can be up to
+4x slower (about 2x typical) than when using \verb'GrB_mxv' with a single
+thread.
-
-Attaching the \verb'iterator' to a matrix or vector does not define a specific
-position for the \verb'iterator'.
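+
+As a sketch of the multi-threaded approach described above (this is not part
+of GraphBLAS itself, just one possible strategy), the following fragment sums
+all entries of a matrix \verb'A', assumed to be of type \verb'GrB_FP64' and
+held by-row, using one row iterator per OpenMP thread.  The explicit rows are
+split evenly among the threads with \verb'GxB_rowIterator_kount' and
+\verb'GxB_rowIterator_kseek'; as noted above, this simple split may give an
+imbalanced load.
+
+{\footnotesize
+\begin{verbatim}
+    double sum = 0 ;
+    int nthreads = 4 ;      // or omp_get_max_threads ( ), for example
+    #pragma omp parallel for reduction(+:sum) num_threads(nthreads)
+    for (int tid = 0 ; tid < nthreads ; tid++)
+    {
+        // each thread gets its own iterator, attached to the same matrix A
+        GxB_Iterator iterator ;
+        GxB_Iterator_new (&iterator) ;
+        GxB_rowIterator_attach (iterator, A, NULL) ;
+        // thread tid visits explicit rows kfirst to klast-1
+        GrB_Index kount  = GxB_rowIterator_kount (iterator) ;
+        GrB_Index kfirst = (tid * kount) / nthreads ;
+        GrB_Index klast  = ((tid + 1) * kount) / nthreads ;
+        GrB_Info info = GxB_rowIterator_kseek (iterator, kfirst) ;
+        for (GrB_Index k = kfirst ; k < klast && info != GxB_EXHAUSTED ; k++)
+        {
+            // visit all entries in the current row, if any
+            while (info == GrB_SUCCESS)
+            {
+                sum += GxB_Iterator_get_FP64 (iterator) ;
+                info = GxB_rowIterator_nextCol (iterator) ;
+            }
+            // advance to the next explicit row
+            info = GxB_rowIterator_nextRow (iterator) ;
+        }
+        GrB_free (&iterator) ;
+    } \end{verbatim}}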
-To use the \verb'iterator', a single call to
-the corresponding {\em seek} method is required.  These
-\verb'GxB*_Iterator_*seek*' methods may also be used later on to change the
-position of the iterator arbitrarily.
+
+To obtain the best performance possible, many of the iterator methods are
+implemented as macros in \verb'GraphBLAS.h'.  Using macros is the default,
+giving typical C and C++ applications access to the fastest methods possible.
-
-    \begin{enumerate}
-    \item {\em row iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_rowIterator_seekRow (iterator, row) ;
-    GrB_Index kount = GxB_rowIterator_kount (iterator) ;
-    GrB_Info info = GxB_rowIterator_kseek (iterator, k) ; \end{verbatim}}
+
+To ensure access to these methods when not using the macros, these methods are
+also defined as regular functions that appear in the compiled
+\verb'libgraphblas.so' library with the same name as the macros.  Applications
+that cannot use the macro versions can \verb'#undef' the macros after the
+\verb'#include <GraphBLAS.h>' statement, and then they would access the regular
+compiled functions in \verb'libgraphblas.so'.  This non-macro approach is not
+the default, and the iterator methods may be slightly slower.
-
-    These methods move a row iterator to a specific row, defined in one of
-    two ways: (1) the row index itself (in range 0 to \verb'nrows'-1), or
-    (2) by specifying \verb'k', which moves the iterator to the \verb'k'th
-    {\em explicit} row (in the range 0 to \verb'kount'-1).  For sparse,
-    bitmap, or full matrices, these two methods are identical.  For
-    hypersparse matrices, not all rows are present in the data structure;
-    these {\em implicit} rows are skipped and not included in the
-    \verb'kount'.  Implicit rows contain no entries.  The
-    \verb'GxB_rowIterator_kount' method returns the \verb'kount' of the
-    matrix, where \verb'kount' is equal to \verb'nrows' for sparse, bitmap,
-    and matrices, and \verb'kount' $\le$ \verb'nrows' for hypersparse
-    matrices.  All three methods listed above can be used for any row
-    iterator.
+
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Iso-Valued Matrices and Vectors} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{iso}
-
-    The \verb'GxB_rowIterator_*seek*' methods return \verb'GrB_SUCCESS' if
-    the iterator has been moved to a row that contains at least one entry,
-    \verb'GrB_NO_VALUE' if the row has no entries, or \verb'GxB_EXHAUSTED'
-    if the row is out of bounds (\verb'row' $\ge$ \verb'nrows' or
-    if \verb'k' $\ge$ \verb'kount').
-    None of these return conditions are
-    errors; they are all informational.
+
+The GraphBLAS C API states that the entries in all \verb'GrB_Matrix' and
+\verb'GrB_Vector' objects have a numerical value, with either a built-in or
+user-defined type.  Representing an unweighted graph requires a value to be
+placed on each edge, typically $a_{ij}=1$.  Adding a structure-only data type
+would not mix well with the rest of GraphBLAS, where all operators, monoids,
+and semirings need to operate on a value of some data type.  And yet
+unweighted graphs are very important in graph algorithms.
-
-    For sparse, bitmap, and full matrices, \verb'GxB_rowIterator_seekRow'
-    always moves to the given row.  For hypersparse matrices, if the
-    requested row is implicit, the iterator is moved to the first
-    explicit row following it.  If no such row exists, the iterator
-    is exhausted and \verb'GxB_EXHAUSTED' is returned.
-    The \verb'GxB_rowIterator_kseek' method always moves to the \verb'k'th
-    explicit row, for any matrix.
-    Use \verb'GxB_rowIterator_getRowIndex', described below, to determine
-    the row index of the current position.
+
+The solution is simple, and exploiting it in SuiteSparse:GraphBLAS requires
+nearly no extensions to the GraphBLAS C API.  SuiteSparse:GraphBLAS can often
+detect when the user application is creating a matrix or vector where all
+entries in the sparsity pattern take on the same numerical value.
-
-    Precondition: on input, the \verb'iterator' must have been successfully
-    attached to a matrix via a prior call to \verb'GxB_rowIterator_attach'.
-    Results are undefined if this precondition is not met.
+
+For example, ${\bf C \langle C \rangle} = 1$, when the mask is structural, sets
+all entries in $\bf C$ to the value 1.  SuiteSparse:GraphBLAS detects this, and
+performs this assignment in $O(1)$ time.  It stores a single copy of this
+``iso-value'' and sets an internal flag in the opaque data structure for $\bf
+C$, which states that all entries in the pattern of $\bf C$ are equal to 1.
+This saves both time and memory and allows for the efficient representation of
+sparse adjacency matrices of unweighted graphs, yet does not change the C API.
+To the user application, it still appears that $\bf C$ has \verb'nvals(C)'
+entries, all equal to 1.
-
-    \item {\em column iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_colIterator_seekCol (iterator, col) ;
-    GrB_Index kount = GxB_colIterator_kount (iterator) ;
-    GrB_Info info = GxB_colIterator_kseek (iterator, k) ; \end{verbatim}}
+
+Creating and operating on iso-valued matrices (or just {\em iso matrices} for
+short) is significantly faster than creating matrices with different data
+values.  A matrix that is iso requires only $O(1)$ space for its numerical
+values.  The sparse and hypersparse formats require an additional $O(n+e)$ or
+$O(e)$ integer space to hold the pattern of an $n$-by-$n$ matrix \verb'C',
+respectively, and a matrix \verb'C' in bitmap format requires $O(n^2)$ space
+for the bitmap.  A full matrix requires no integer storage, so a matrix that is
+both iso and full requires only $O(1)$ space, regardless of its dimension.
-
-    These methods move a column iterator to a specific column, defined in
-    one of two ways: (1) the column index itself (in range 0 to
-    \verb'ncols'-1), or (2) by specifying \verb'k', which moves the
-    iterator to the \verb'k'th {\em explicit} column (in the range 0 to
-    \verb'kount'-1).  For sparse, bitmap, or full matrices, these two
-    methods are identical.  For hypersparse matrices, not all columns are
-    present in the data structure; these {\em implicit} columns are skipped
-    and not included in the \verb'kount'.  Implicit columns contain no
-    entries.  The \verb'GxB_colIterator_kount' method returns the
-    \verb'kount' of the matrix, where \verb'kount' is equal to \verb'ncols'
-    for sparse, bitmap, and matrices, and \verb'kount' $\le$ \verb'ncols'
-    for hypersparse matrices.  All three methods listed above can be used
-    for any column iterator.
+
+The sections below describe the methods that can be used to create iso
+matrices and vectors.  Let $a$, $b$, and $c$ denote the iso values of \verb'A',
+\verb'B', and \verb'C', respectively.
- The \verb'GxB_colIterator_*seek*' methods return \verb'GrB_SUCCESS' if - the iterator has been moved to a column that contains at least one - entry, \verb'GrB_NO_VALUE' if the column has no entries, or - \verb'GxB_EXHAUSTED' if the column is out of bounds (\verb'col' $\ge$ - \verb'ncols' or \verb'k' $\ge$ \verb'kount'). - None of these return conditions are - errors; they are all informational. +%------------------------------------------------------------------------------- +\subsection{Using iso matrices and vectors in a graph algorithm} +%------------------------------------------------------------------------------- +\label{iso_usage} - For sparse, bitmap, and full matrices, \verb'GxB_colIterator_seekCol' - always moves to the given column. For hypersparse matrices, if the - requested column is implicit, the iterator is moved to the first - explicit column following it. If no such column exists, the iterator - is exhausted and \verb'GxB_EXHAUSTED' is returned. - The \verb'GxB_colIterator_kseek' method always moves to the \verb'k'th - explicit column, for any matrix. - Use \verb'GxB_colIterator_getColIndex', described below, to determine - the column index of the current position. +There are two primary useful ways to use iso-valued matrices and vectors: (1) +as iso sparse/hypersparse adjacency matrices for unweighted graphs, and (2) as +iso full matrices or vectors used with operations that do not need to access +all of the content of the iso full matrix or vector. - Precondition: on input, the \verb'iterator' must have been successfully - attached to a matrix via a prior call to \verb'GxB_colIterator_attach'. - Results are undefined if this precondition is not met. +In the first use case, simply create a \verb'GrB_Matrix' with values that are +all the same (those in the sparsity pattern). The +\verb'GxB_Matrix_build_Scalar' method can be used for this, since it +guarantees that the time and work spent on the numerical part of the array +is only $O(1)$. The method still must spend $O(e)$ or $O(e \log e)$ time +on the integer arrays that represent the sparsity pattern, but the reduction +in time and work on the numerical part of the matrix will improve performance. - \item {\em entry iterator}: - {\footnotesize - \begin{verbatim} - GrB_Info info = GxB_Matrix_Iterator_seek (iterator, p) ; - GrB_Index pmax = GxB_Matrix_Iterator_getpmax (iterator) ; - GrB_Index p = GxB_Matrix_Iterator_getp (iterator); \end{verbatim}} +The use of \verb'GxB_Matrix_build_Scalar' is optional. Matrices can also be +constructed with \verb'GrB*' methods. In particular, \verb'GrB_Matrix_build_*' +can be used. It first builds a non-iso matrix and then checks if all of the +values are the same, after assembling any duplicate entries. This does not +save time or memory for the construction of the matrix itself, but it will +lead to savings in time and memory later on, when the matrix is used. - The \verb'GxB_Matrix_Iterator_seek' method moves the \verb'iterator' to - the given position \verb'p', which is in the range 0 to \verb'pmax'-1, - where the value of \verb'pmax' is obtained from - \verb'GxB_Matrix_Iterator_getpmax'. - For sparse, hypersparse, and full matrices, \verb'pmax' is the same as - \verb'nvals' returned by \verb'GrB_Matrix_nvals'. For bitmap matrices, - \verb'pmax' is equal to \verb'nrows*ncols'. If \verb'p' $\ge$ - \verb'pmax', the iterator is exhausted and \verb'GxB_EXHAUSTED' is - returned. Otherwise, \verb'GrB_SUCCESS' is returned. 
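+
+As a minimal sketch of this first use case, the following fragment builds an
+iso boolean adjacency matrix of an unweighted graph with
+\verb'GxB_Matrix_build_Scalar'.  The index arrays \verb'I' and \verb'J' of
+length \verb'e', and the dimension \verb'n', are assumed to be defined
+already:
+
+{\footnotesize
+\begin{verbatim}
+    // A is n-by-n with e entries, all of them equal to true
+    GrB_Matrix A ;
+    GrB_Scalar one ;
+    GrB_Scalar_new (&one, GrB_BOOL) ;
+    GrB_Scalar_setElement_BOOL (one, true) ;
+    GrB_Matrix_new (&A, GrB_BOOL, n, n) ;
+    // O(e log e) work on the pattern, but only O(1) on the values
+    GxB_Matrix_build_Scalar (A, I, J, one, e) ;
+    GrB_free (&one) ; \end{verbatim}}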
+To ensure a matrix \verb'C' is iso-valued, simply use \verb'GrB_assign' to
+compute \verb'C=1', or assign whatever scalar value you wish.
+It is essential to use a structural mask.  Otherwise, it is not clear that
+all entries in \verb'C' will be assigned the same value.  The following
+code takes $O(1)$ time, and it resets the size of the numerical part of the
+\verb'C' matrix to be $O(1)$ in size:
-
-    All entries in the matrix are given an ordinal position, \verb'p'.
-    Seeking to position \verb'p' will either move the \verb'iterator' to
-    that particular position, or to the next higher position containing an
-    entry if there is entry at position \verb'p'.  The latter case only
-    occurs for bitmap matrices.
-    Use \verb'GxB_Matrix_Iterator_getp' to determine the current
-    position of the iterator.
+
+{\footnotesize
+\begin{verbatim}
+    bool scalar = true ;
+    GrB_Matrix_assign_BOOL (C, C, NULL, scalar, GrB_ALL, nrows, GrB_ALL, ncols,
+        GrB_DESC_S) ; \end{verbatim}}
-
-    Precondition: on input, the \verb'iterator' must have been successfully
-    attached to a matrix via a prior call to
-    \verb'GxB_Matrix_Iterator_attach'.  Results are undefined if this
-    precondition is not met.
+
+The Octave/MATLAB analog of the code above is \verb'C=spones(C)'.
-
-    \item {\em vector iterator}:
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_Vector_Iterator_seek (iterator, p) ;
-    GrB_Index pmax = GxB_Vector_Iterator_getpmax (iterator) ;
-    GrB_Index p = GxB_Vector_Iterator_getp (iterator); \end{verbatim}}
+
+The second case where iso matrices and vectors are useful is to use them
+with operations that do not necessarily access all of their content.
+Suppose you have a matrix \verb'A' of arbitrarily large dimension (say
+\verb'n'-by-\verb'n' where \verb'n=2^60', of type \verb'GrB_FP64').  A matrix
+this large can be represented by SuiteSparse:GraphBLAS, but only in a
+hypersparse form.
-
-    The \verb'GxB_Vector_Iterator_seek' method is identical to the
-    entry iterator of a matrix, but applied to a \verb'GrB_Vector' instead.
+
+Now, suppose you wish to compute the maximum value in each row, reducing the
+matrix to a vector.  This can be done with \verb'GrB_reduce':
-
-    Precondition: on input, the \verb'iterator' must have been successfully
-    attached to a vector via a prior call to
-    \verb'GxB_Vector_Iterator_attach'.  Results are undefined if this
-    precondition is not met.
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector_new (&v, GrB_FP64, n) ;
+    GrB_reduce (v, NULL, NULL, GrB_MAX_MONOID_FP64, A, NULL) ; \end{verbatim}}
-
-    \end{enumerate}
+
+It can also be done with \verb'GrB_mxv', by creating an iso full vector
+\verb'x'.  The creation of \verb'x' takes $O(1)$ time and memory,
+and the \verb'GrB_mxv' computation takes $O(e)$ time (with modest assumptions;
+if \verb'A' needs to be transposed the time would be $O(e \log e)$).
-
-%===============================================================================
-\subsection{Advancing to the next position}
-%===============================================================================
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector_new (&v, GrB_FP64, n) ;
+    GrB_Vector_new (&x, GrB_FP64, n) ;
+    GrB_assign (x, NULL, NULL, 1, GrB_ALL, n, NULL) ;
+    GrB_mxv (v, NULL, NULL, GrB_MAX_FIRST_SEMIRING_FP64, A, x, NULL) ; \end{verbatim}}
-
-For best performance, the {\em seek} methods described above should be used
-with care, since some of them require $O(\log n)$ time.
-The fastest method
-for changing the position of the iterator is the corresponding {\em next}
-method, described below for each iterator:
+
+The above computations are identical in SuiteSparse:GraphBLAS.  Internally,
+\verb'GrB_reduce' creates \verb'x' and calls \verb'GrB_mxv'.  Using
+\verb'GrB_mxv' directly gives the user application additional flexibility in
+creating new computations that exploit the multiplicative operator in the
+semiring.  \verb'GrB_reduce' always uses the \verb'FIRST' operator in its
+semiring, but any other binary operator can be used instead when using
+\verb'GrB_mxv'.
-
-    \begin{enumerate}
-    \item {\em row iterator}: To move to the next row.
+
+Below is a method for computing the argmax of each row of a square matrix
+\verb'A' of dimension \verb'n' and type \verb'GrB_FP64'.  The vector \verb'x'
+contains the maximum value in each row, and the vector \verb'p' contains the
+zero-based column index of the maximum value in each row.  If there are
+duplicate maximum values in a row, any one of them is selected arbitrarily
+using the \verb'ANY' monoid.  To select the minimum column index of the
+duplicate maximum values, use the \verb'GxB_MIN_SECONDI_INT64' semiring instead
+(this will be slightly slower than the \verb'ANY' monoid if there are many
+duplicates).
-
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_rowIterator_nextRow (iterator) ; \end{verbatim}}
+
+To compute the argmax of each column, use the \verb'GrB_DESC_T0' descriptor
+in \verb'GrB_mxv', and compute \verb'G=A*D' instead of \verb'G=D*A' with
+\verb'GrB_mxm'.  See the \verb'GrB.argmin' and \verb'GrB.argmax' functions
+in the Octave/MATLAB interface for details.
-
-    The row iterator is a 2-dimensional iterator, requiring an outer loop and
-    an inner loop.  The outer loop iterates over the rows of the matrix, using
-    \verb'GxB_rowIterator_nextRow' to move to the next row.  If the matrix is
-    hypersparse, the next row is always an explicit row; implicit rows are
-    skipped.  The return conditions are identical to
-    \verb'GxB_rowIterator_seekRow'.
+
+% corresponds to GrB.argmax with dim = 2
-
-    Preconditions: on input, the row iterator must already be attached to a
-    matrix via a prior call to \verb'GxB_rowIterator_attach', and the
-    \verb'iterator' must be at a specific row, via a prior call to
-    \verb'GxB_rowIterator_*seek*' or \verb'GxB_rowIterator_nextRow'.
-    Results are undefined if these conditions are not met.
+
+{\footnotesize
+\begin{verbatim}
+    GrB_Vector_new (&x, GrB_FP64, n) ;
+    GrB_Vector_new (&y, GrB_FP64, n) ;
+    GrB_Vector_new (&p, GrB_INT64, n) ;
+    // y (:) = 1, an iso full vector
+    GrB_assign (y, NULL, NULL, 1, GrB_ALL, n, NULL) ;
+    // x = max (A) where x(i) = max (A (i,:))
+    GrB_mxv (x, NULL, NULL, GrB_MAX_FIRST_SEMIRING_FP64, A, y, NULL) ;
+    // D = diag (x)
+    GrB_Matrix_diag (&D, x, 0) ;
+    // G = D*A using the ANY_EQ semiring
+    GrB_Matrix_new (&G, GrB_BOOL, n, n) ;
+    GrB_mxm (G, NULL, NULL, GxB_ANY_EQ_FP64, D, A, NULL) ;
+    // drop explicit zeros from G
+    GrB_select (G, NULL, NULL, GrB_VALUENE_BOOL, G, 0, NULL) ;
+    // find the position of any max entry in each row: p = G*y,
+    // so that p(i) = j if x(i) = A(i,j) = max (A (i,:))
+    GrB_mxv (p, NULL, NULL, GxB_ANY_SECONDI_INT64, G, y, NULL) ; \end{verbatim}}
-
-    \item {\em row iterator}: To move to the next entry within a row.
+
+No part of the above code takes $\Omega(n)$ time or memory.  The data type of
+the iso full vector \verb'y' can be anything, and its iso value can be
+anything.
+It is operated on by the \verb'FIRST' operator in the first
+\verb'GrB_mxv', and the \verb'SECONDI' positional operator in the second
+\verb'GrB_mxv', and both operators are oblivious to the content and even the
+type of \verb'y'.  The semirings simply note that \verb'y' is a full vector and
+compute their result accordingly, by accessing the matrices only (\verb'A' and
+\verb'G', respectively).
-
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_rowIterator_nextCol (iterator) ; \end{verbatim}}
+
+For floating-point values, \verb'NaN' values are ignored, and treated as if
+they were not present in the input matrix, unless all entries in a given row
+are equal to \verb'NaN'.  In that case, if all entries in \verb'A(i,:)' are
+equal to \verb'NaN', then \verb'x(i)' is \verb'NaN' and the entry \verb'p(i)'
+is not present.
-
-    The row iterator is moved to the next entry in the current row.
-    The method returns \verb'GrB_NO_VALUE' if the end of the row is reached.
-    The iterator does not move to the next row in this case.
-    The method returns \verb'GrB_SUCCESS' if the iterator has been moved
-    to a specific entry in the current row.
+
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from matrix multiplication}
+%-------------------------------------------------------------------------------
+\label{iso_mxm}
-
-    Preconditions: the same as \verb'GxB_rowIterator_nextRow'.
+
+Consider \verb'GrB_mxm', \verb'GrB_mxv', and \verb'GrB_vxm', and
+    let \verb'C=A*B', where no mask is present, or \verb'C<M>=A*B' where
+    \verb'C' is initially empty.  If \verb'C' is not initially empty,
+    then these rules apply to a temporary matrix \verb'T=A*B', which is
+    initially empty and is then assigned to \verb'C' via \verb'C=T'.
-
-    \item {\em column iterator}: To move to the next column
+
+    The iso property of \verb'C' is determined with the following rules,
+    where the first rule that fits defines the property and value of \verb'C'.
-
-    {\footnotesize
-    \begin{verbatim}
-    GrB_Info info = GxB_colIterator_nextCol (iterator) ; \end{verbatim}}
+
+    \begin{itemize}
+    \item If the semiring includes a positional multiplicative operator
+    (\verb'GxB_FIRSTI', \verb'GxB_SECONDI', and related operators), then
+    \verb'C' is never iso.
-
-    The column iterator is a 2-dimensional iterator, requiring an outer loop
-    and an inner loop.  The outer loop iterates over the columns of the matrix,
-    using \verb'GxB_colIterator_nextCol' to move to the next column.  If the
-    matrix is hypersparse, the next column is always an explicit column;
-    implicit columns are skipped.  The return conditions are identical to
-    \verb'GxB_colIterator_seekCol'.
+
+    \item Define an {\em iso-monoid} as a built-in monoid with the property
+    that reducing a set of $n>1$ identical values $x$ returns the same value
+    $x$.  These are the \verb'MIN', \verb'MAX', \verb'LOR', \verb'LAND',
+    \verb'BOR', \verb'BAND', and \verb'ANY' monoids.  All other monoids are
+    not iso-monoids:
+    \verb'PLUS', \verb'TIMES', \verb'LXNOR', \verb'EQ', \verb'BXOR',
+    \verb'BXNOR', and all user-defined monoids.  Currently, there is no
+    mechanism for telling SuiteSparse:GraphBLAS that a user-defined monoid
+    is an iso-monoid.
-
-    Preconditions: on input, the column iterator must already be attached to a
-    matrix via a prior call to \verb'GxB_colIterator_attach', and the
-    \verb'iterator' must be at a specific column, via a prior call to
-    \verb'GxB_colIterator_*seek*' or \verb'GxB_colIterator_nextCol'.
-    Results are undefined if these conditions are not met.
+ \item If the multiplicative op is \verb'PAIR' (same as \verb'ONEB'), and
+ the monoid is an iso-monoid or the \verb'EQ' or \verb'TIMES' monoid, then
+ \verb'C' is iso with a value of 1.

- {\footnotesize
- \item {\em column iterator}: To move to the next entry within a column.

- \begin{verbatim}
- GrB_Info info = GxB_colIterator_nextRow (iterator) ; \end{verbatim}}
+ \item If both \verb'B' and the monoid are iso, and the multiplicative op is
+ \verb'SECOND' or \verb'ANY', then \verb'C' is iso with a value of $b$.

- The column iterator is moved to the next entry in the current column.
- The method returns \verb'GrB_NO_VALUE' if the end of the column is reached.
- The iterator does not move to the next column in this case.
- The method returns \verb'GrB_SUCCESS' if the iterator has been moved
- to a specific entry in the current column.
+ \item If both \verb'A' and the monoid are iso, and the multiplicative op is
+ \verb'FIRST' or \verb'ANY', then \verb'C' is iso with a value of $a$.

- Preconditions: the same as \verb'GxB_colIterator_nextCol'.
+ \item If \verb'A', \verb'B', and the monoid are all iso, then \verb'C'
+ is iso, with a value $c=f(a,b)$, where $f$ is any multiplicative op
+ (including user-defined, which assumes that a user-defined $f$ has no
+ side effects).

- \item {\em entry iterator}: To move to the next entry.
- {\footnotesize
- \begin{verbatim}
- GrB_Info info = GxB_Matrix_Iterator_next (iterator) ; \end{verbatim}}
+ \item If \verb'A' and \verb'B' are both iso and full (all entries present,
+ regardless of the format of the matrices), then \verb'C' is iso and full.
+ Its iso value is computed in $O(\log(n))$ time, via a reduction of $n$
+ copies of the value $t=f(a,b)$ to a scalar. The storage required to
+ represent \verb'C' is just $O(1)$, regardless of its dimension.
+ Technically, the \verb'PLUS' monoid could be computed as $c=nt$ in $O(1)$
+ time, but the log-time reduction works for any monoid, including
+ user-defined ones.

- This method moves an iterator to the next entry of a matrix.
- It returns \verb'GrB_SUCCESS' if the iterator is at an entry that
- exists in the matrix, or \verb'GrB_EXHAUSTED' otherwise.
+ \item Otherwise, \verb'C' is not iso.
+ \end{itemize}

- Preconditions: on input, the entry iterator must be already attached to a
- matrix via \verb'GxB_Matrix_Iterator_attach', and the position of the
- iterator must also have been defined by a prior call to
- \verb'GxB_Matrix_Iterator_seek' or \verb'GxB_Matrix_Iterator_next'.
- Results are undefined if these conditions are not met.
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from eWiseMult and kronecker}
+%-------------------------------------------------------------------------------
+\label{iso_emult}

- \item {\em vector iterator}: To move to the next entry.
- {\footnotesize
- \begin{verbatim}
- GrB_Info info = GxB_Vector_Iterator_next (iterator) ; \end{verbatim}}
+Consider \verb'GrB_eWiseMult'. Let
+\verb'C=A.*B', or \verb'C<M>=A.*B' with any mask and where \verb'C' is
+initially empty, where \verb'.*' denotes a binary operator $f(x,y)$
+applied with \verb'eWiseMult'. These rules also apply to \verb'GrB_kronecker'.

- This method moves an iterator to the next entry of a vector.
- It returns \verb'GrB_SUCCESS' if the iterator is at an entry that
- exists in the vector, or \verb'GrB_EXHAUSTED' otherwise.
+ \item If the op is \verb'PAIR' (same as \verb'ONEB'),
+ then \verb'C' is iso with $c=1$.

- Preconditions: on input, the iterator must be already attached to a
- vector via \verb'GxB_Vector_Iterator_attach', and the position of the
- iterator must also have been defined by a prior call to
- \verb'GxB_Vector_Iterator_seek' or \verb'GxB_Vector_Iterator_next'.
- Results are undefined if these conditions are not met.
+ \item If \verb'B' is iso and the op is \verb'SECOND' or \verb'ANY',
+ then \verb'C' is iso with $c=b$.

- \end{enumerate}
+ \item If \verb'A' is iso and the op is \verb'FIRST' or \verb'ANY',
+ then \verb'C' is iso with $c=a$.

-%===============================================================================
-\subsection{Accessing the indices of the current entry}
-%===============================================================================
+ \item If both \verb'A' and \verb'B' are iso,
+ then \verb'C' is iso with $c=f(a,b)$.

-Once the iterator is attached to a matrix or vector, and is placed in position
-at an entry in the matrix or vector, the indices and value of this entry can be
-obtained. The methods for accessing the value of the entry are described in
-Section~\ref{getvalu}. Accessing the indices is performed with four different
-sets of methods, depending on which access pattern is in use, described below:
+ \item Otherwise, \verb'C' is not iso.
+ \end{itemize}

- \begin{enumerate}
- \item {\em row iterator}: To get the current row index.
- {\footnotesize
- \begin{verbatim}
- GrB_Index i = GxB_rowIterator_getRowIndex (iterator) ; \end{verbatim}}
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from eWiseAdd}
+%-------------------------------------------------------------------------------
+\label{iso_add}

- The method returns \verb'nrows(A)' if the iterator is exhausted, or the
- current row index \verb'i' otherwise. There need not be any entry in the
- current row. Zero is returned if the iterator is attached to the matrix
- but \verb'GxB_rowIterator_*seek*' has not been called, but this does not
- mean the iterator is positioned at row zero.
+Consider \verb'GrB_eWiseAdd', and also the accumulator phase of \verb'C+=T'
+when an accumulator operator is present. Let \verb'C=A+B', or \verb'C<M>=A+B'
+with any mask and where \verb'C' is initially empty.

- Preconditions: on input, the iterator must be already successfully attached
- to matrix as a row iterator via \verb'GxB_rowIterator_attach'.
- Results are undefined if this condition is not met.
+ \begin{itemize}
+ \item If both \verb'A' and \verb'B' are full (all entries present), then
+ the rules for \verb'eWiseMult' in Section~\ref{iso_emult} are used
+ instead.

- \item {\em row iterator}: To get the current column index.
- {\footnotesize
- \begin{verbatim}
- GrB_Index j = GxB_rowIterator_getColIndex (iterator) ; \end{verbatim}}
+ \item If the operator is positional (\verb'GxB_FIRSTI' and related) then
+ \verb'C' is not iso.

- Preconditions: on input, the iterator must be already successfully attached
- to matrix as a row iterator via \verb'GxB_rowIterator_attach', and in
- addition, the row iterator must be positioned at a valid entry present in
- the matrix. That is, the last call to \verb'GxB_rowIterator_*seek*' or
- \verb'GxB_rowIterator_*next*', must have returned \verb'GrB_SUCCESS'.
- Results are undefined if these conditions are not met.
+ \item If $a$ and $b$ differ (when typecasted to the type of \verb'C'),
+ then \verb'C' is not iso.
- \item {\em column iterator}: To get the current column index. - {\footnotesize - \begin{verbatim} - GrB_Index j = GxB_colIterator_getColIndex (iterator) ; \end{verbatim}} + \item If $c=f(a,b) = a = b$ holds, then \verb'C' is iso, + where $f(a,b)$ is the operator. - The method returns \verb'ncols(A)' if the iterator is exhausted, or the - current column index \verb'j' otherwise. There need not be any entry in the - current column. Zero is returned if the iterator is attached to the matrix - but \verb'GxB_colIterator_*seek*' has not been called, but this does not - mean the iterator is positioned at column zero. + \item Otherwise, \verb'C' is not iso. + \end{itemize} - Precondition: on input, the iterator must be already successfully attached - to matrix as a column iterator via \verb'GxB_colIterator_attach'. - Results are undefined if this condition is not met. +%------------------------------------------------------------------------------- +\subsection{Iso matrices from eWiseUnion} +%------------------------------------------------------------------------------- +\label{iso_union} - \item {\em column iterator}: To get the current row index. - {\footnotesize - \begin{verbatim} - GrB_Index i = GxB_colIterator_getRowIndex (iterator) ; \end{verbatim}} +\verb'GxB_eWiseUnion' is very similar to \verb'GrB_eWiseAdd', but the rules +for when the result is iso-valued are very different. - Preconditions: on input, the iterator must be already successfully attached - to matrix as a column iterator via \verb'GxB_colIterator_attach', and in - addition, the column iterator must be positioned at a valid entry present in - the matrix. That is, the last call to \verb'GxB_colIterator_*seek*' or - \verb'GxB_colIterator_*next*', must have returned \verb'GrB_SUCCESS'. - Results are undefined if these conditions are not met. + \begin{itemize} + \item If both \verb'A' and \verb'B' are full (all entries present), then + the rules for \verb'eWiseMult' in Section~\ref{iso_emult} are used + instead. - \item {\em entry iterator}: To get the current row and column index. - {\footnotesize - \begin{verbatim} - GrB_Index i, j ; - GxB_Matrix_Iterator_getIndex (iterator, &i, &j) ; \end{verbatim}} + \item If the operator is positional (\verb'GxB_FIRSTI' and related) then + \verb'C' is not iso. - Returns the row and column index of the current entry. + \item If the op is \verb'PAIR' (same as \verb'ONEB'), + then \verb'C' is iso with $c=1$. - Preconditions: on input, the entry iterator must be already attached to a - matrix via \verb'GxB_Matrix_Iterator_attach', and the position of the - iterator must also have been defined by a prior call to - \verb'GxB_Matrix_Iterator_seek' or \verb'GxB_Matrix_Iterator_next', with a - return value of \verb'GrB_SUCCESS'. - Results are undefined if these conditions are not met. + \item If \verb'B' is iso and the op is \verb'SECOND' or \verb'ANY', + and the input scalar \verb'beta' matches $b$ + (the iso-value of \verb'B'), + then \verb'C' is iso with $c=b$. - \item {\em vector iterator}: To get the current index. - {\footnotesize - \begin{verbatim} - GrB_Index i = GxB_Vector_Iterator_getIndex (iterator) ; \end{verbatim}} + \item If \verb'A' is iso and the op is \verb'FIRST' or \verb'ANY', + and the input scalar \verb'alpha' matches $a$ + (the iso-value of \verb'A'), + then \verb'C' is iso with $c=a$. - Returns the index of the current entry. + \item If both \verb'A' and \verb'B' are iso, + and $f(a,b) = f(\alpha,b) = f(a,\beta)$, + then \verb'C' is iso with $c=f(a,b)$. 
- Preconditions: on input, the entry iterator must be already attached to a - matrix via \verb'GxB_Vector_Iterator_attach', and the position of the - iterator must also have been defined by a prior call to - \verb'GxB_Vector_Iterator_seek' or \verb'GxB_Vector_Iterator_next', with a - return value of \verb'GrB_SUCCESS'. - Results are undefined if these conditions are not met. + \item Otherwise, \verb'C' is not iso. + \end{itemize} - \end{enumerate} +%------------------------------------------------------------------------------- +\subsection{Reducing iso matrices to a scalar or vector} +%------------------------------------------------------------------------------- +\label{iso_reduce} -%=============================================================================== -\subsection{Accessing the value of the current entry} -\label{getvalu} -%=============================================================================== +If \verb'A' is iso with $e$ entries, reducing it to a scalar takes $O(\log(e))$ +time, regardless of the monoid used to reduce the matrix to a scalar. Reducing +\verb'A' to a vector \verb'c' is the same as the matrix-vector multiply +\verb"c=A*x" or \verb"c=A'*x", depending on the descriptor, where \verb'x' +is an iso full vector (refer to Section~\ref{iso_mxm}). -So far, all methods that create or use an iterator have been split into four -sets of methods, for the row, column, or entry iterators attached to a matrix, -or for a vector iterator. Accessing the value is different. All four -iterators use the same set of methods to access the value of their current -entry. These methods return the value of the current entry at the position -determined by the iterator. The return value can of course be typecasted -using standard C syntax once the value is returned to the caller. +%------------------------------------------------------------------------------- +\subsection{Iso matrices from apply} +%------------------------------------------------------------------------------- +\label{iso_apply} -Preconditions: on input, the prior call to \verb'GxB_*Iterator_*seek*', or -\verb'GxB_*Iterator_*next*' must have returned \verb'GrB_SUCCESS', indicating -that the iterator is at a valid current entry for either a matrix or vector. -No typecasting is permitted, in the sense that the method name must match the -type of the matrix or vector. -Results are undefined if these conditions are not met. +Let \verb'C=f(A)' denote the application of a unary operator \verb'f', +and let \verb'C=f(A,s)' and \verb'C=f(s,A)' denote the application of a binary +operator with \verb's' a scalar. - {\footnotesize - \begin{verbatim} - // for built-in types: - bool value = GxB_Iterator_get_BOOL (iterator) ; - int8_t value = GxB_Iterator_get_INT8 (iterator) ; - int16_t value = GxB_Iterator_get_INT16 (iterator) ; - int32_t value = GxB_Iterator_get_INT32 (iterator) ; - int64_t value = GxB_Iterator_get_INT64 (iterator) ; - uint8_t value = GxB_Iterator_get_UINT8 (iterator) ; - uint16_t value = GxB_Iterator_get_UINT16 (iterator) ; - uint32_t value = GxB_Iterator_get_UINT32 (iterator) ; - uint64_t value = GxB_Iterator_get_UINT64 (iterator) ; - float value = GxB_Iterator_get_FP32 (iterator) ; - double value = GxB_Iterator_get_FP64 (iterator) ; - GxB_FC32_t value = GxB_Iterator_get_FC32 (iterator) ; - GxB_FC64_t value = GxB_Iterator_get_FC64 (iterator) ; + \begin{itemize} + \item If the operator is positional (\verb'GxB_POSITION*', + \verb'GxB_FIRSTI', and related) then \verb'C' is not iso. 
- // for user-defined types:
- value ;
- GxB_Iterator_get_UDT (iterator, (void *) &value) ; \end{verbatim}}
+ \item If the operator is \verb'ONE' or \verb'PAIR' (same as \verb'ONEB'),
+ then \verb'C' is iso with $c=1$.

-%===============================================================================
-\newpage
-\subsection{Example: row iterator for a matrix}
-%===============================================================================
+ \item If the operator is \verb'FIRST' or \verb'ANY' with \verb'C=f(s,A)',
+ then \verb'C' is iso with $c=s$.

-The following example uses a row iterator to access all of the entries
-in a matrix \verb'A' of type \verb'GrB_FP64'. Note the inner and outer loops.
-The outer loop iterates over all rows of the matrix. The inner loop iterates
-over all entries in the row \verb'i'. This access pattern requires the matrix
-to be held by-row, but otherwise it works for any matrix. If the matrix is
-held by-column, then use the column iterator methods instead.
+ \item If the operator is \verb'SECOND' or \verb'ANY' with \verb'C=f(A,s)',
+ then \verb'C' is iso with $c=s$.

- {\footnotesize
- \begin{verbatim}
- // create an iterator
- GxB_Iterator iterator ;
- GxB_Iterator_new (&iterator) ;
- // attach it to the matrix A, known to be type GrB_FP64
- GrB_Info info = GxB_rowIterator_attach (iterator, A, NULL) ;
- if (info < 0) { handle the failure ... }
- // seek to A(0,:)
- info = GxB_rowIterator_seekRow (iterator, 0) ;
- while (info != GxB_EXHAUSTED)
- {
- // iterate over entries in A(i,:)
- GrB_Index i = GxB_rowIterator_getRowIndex (iterator) ;
- while (info == GrB_SUCCESS)
- {
- // get the entry A(i,j)
- GrB_Index j = GxB_rowIterator_getColIndex (iterator) ;
- double aij = GxB_Iterator_get_FP64 (iterator) ;
- // move to the next entry in A(i,:)
- info = GxB_rowIterator_nextCol (iterator) ;
- }
- // move to the next row, A(i+1,:), or a subsequent one if i+1 is implicit
- info = GxB_rowIterator_nextRow (iterator) ;
- }
- GrB_free (&iterator) ; \end{verbatim}}
+ \item If \verb'A' is iso then \verb'C' is iso, with the following value
+ of $c$:
+
+ \begin{itemize}
+ \item If the op is \verb'IDENTITY', then $c=a$.
+ \item If the op is unary with \verb'C=f(A)', then $c=f(a)$.
+ \item If the op is binary with \verb'C=f(s,A)', then $c=f(s,a)$.
+ \item If the op is binary with \verb'C=f(A,s)', then $c=f(a,s)$.
+ \end{itemize}

-%===============================================================================
-\newpage
-\subsection{Example: column iterator for a matrix}
-%===============================================================================
-The column iterator is analgous to the row iterator.
+ \item Otherwise, \verb'C' is not iso.
+ \end{itemize}

-The following example uses a column iterator to access all of the entries in a
-matrix \verb'A' of type \verb'GrB_FP64'. The outer loop iterates over all
-columns of the matrix. The inner loop iterates over all entries in the column
-\verb'j'. This access pattern requires the matrix to be held by-column, but
-otherwise it works for any matrix. If the matrix is held by-row, then use
-the row iterator methods instead.
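+As an illustration of the rules above (a sketch only, not a method from the
+library; it assumes \verb'A' is an \verb'm'-by-\verb'n' iso matrix of type
+\verb'GrB_FP64' with iso value $a$), applying the unary operator
+\verb'GrB_AINV_FP64' produces an iso result with $c=-a$, which can be
+verified with \verb'GxB_Matrix_iso':
+
+ {\footnotesize
+ \begin{verbatim}
+ // C = -A ; if A is iso with value a, then C is iso with value -a
+ GrB_Matrix C = NULL ;
+ bool C_iso = false ;
+ GrB_Matrix_new (&C, GrB_FP64, m, n) ;
+ GrB_apply (C, NULL, NULL, GrB_AINV_FP64, A, NULL) ;
+ GxB_Matrix_iso (&C_iso, C) ;    // C_iso is now true \end{verbatim}}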
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from select}
+%-------------------------------------------------------------------------------
+\label{iso_select}

- {\footnotesize
- \begin{verbatim}
- // create an iterator
- GxB_Iterator iterator ;
- GxB_Iterator_new (&iterator) ;
- // attach it to the matrix A, known to be type GrB_FP64
- GrB_Info info = GxB_colIterator_attach (iterator, A, NULL) ;
- // seek to A(:,0)
- info = GxB_colIterator_seekCol (iterator, 0) ;
- while (info != GxB_EXHAUSTED)
- {
- // iterate over entries in A(:,j)
- GrB_Index j = GxB_colIterator_getColIndex (iterator) ;
- while (info == GrB_SUCCESS)
- {
- // get the entry A(i,j)
- GrB_Index i = GxB_colIterator_getRowIndex (iterator) ;
- double aij = GxB_Iterator_get_FP64 (iterator) ;
- // move to the next entry in A(:,j)
- info = GxB_colIterator_nextRow (iterator) ;
- OK (info) ;
- }
- // move to the next column, A(:,j+1), or a subsequent one if j+1 is implicit
- info = GxB_colIterator_nextCol (iterator) ;
- }
- GrB_free (&iterator) ; \end{verbatim}}
+Let \verb'C=select(A)' denote the application of a \verb'GrB_IndexUnaryOp' operator
+in \verb'GrB_select'.

-%===============================================================================
-\newpage
-\subsection{Example: entry iterator for a matrix}
-%===============================================================================
+ \begin{itemize}
+ \item If \verb'A' is iso, then \verb'C' is iso with $c=a$.
+ \item If the operator is any \verb'GrB_VALUE*_BOOL' operator,
+ with no typecasting, and the test is true only for a single boolean
+ value, then \verb'C' is iso.
+ \item If the operator is \verb'GrB_VALUEEQ_*', with no typecasting,
+ then \verb'C' is iso, with $c=t$ where $t$ is the value of the scalar
+ \verb'y'.
+ \item If the operator is \verb'GrB_VALUELE_UINT*', with no typecasting,
+ and the scalar \verb'y' is zero, then \verb'C' is iso with $c=0$.
+ \item Otherwise, \verb'C' is not iso.
+ \end{itemize}

-The entry iterator allows for a simpler access pattern, with a single loop, but
-using a row or column iterator is faster. The method works for any matrix.
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from assign and subassign}
+%-------------------------------------------------------------------------------
+\label{iso_assign}

- {\footnotesize
- \begin{verbatim}
- // create an iterator
- GxB_Iterator iterator ;
- GxB_Iterator_new (&iterator) ;
- // attach it to the matrix A, known to be type GrB_FP64
- GrB_Info info = GxB_Matrix_Iterator_attach (iterator, A, NULL) ;
- if (info < 0) { handle the failure ... }
- // seek to the first entry
- info = GxB_Matrix_Iterator_seek (iterator, 0) ;
- while (info != GxB_EXHAUSTED)
- {
- // get the entry A(i,j)
- GrB_Index i, j ;
- GxB_Matrix_Iterator_getIndex (iterator, &i, &j) ;
- double aij = GxB_Iterator_get_FP64 (iterator) ;
- // move to the next entry in A
- info = GxB_Matrix_Iterator_next (iterator) ;
- }
- GrB_free (&iterator) ; \end{verbatim}}
+These rules are somewhat complex. Consider the assignment \verb'C<M>(I,J)=...'
+with \verb'GrB_assign'. Internally, this assignment is converted into
+\verb'C(I,J)<M(I,J)>=...' and then \verb'GxB_subassign' is used. Thus,
+all of the rules below assume the form \verb'C(I,J)<M>=...' where \verb'M'
+has the same size as the submatrix \verb'C(I,J)'.
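+For example (a sketch; \verb'C' is assumed to be \verb'm'-by-\verb'n', with
+hypothetical masks \verb'M_big' and \verb'M_sub' and index lists \verb'I' and
+\verb'J' of length \verb'ni' and \verb'nj'), the two methods take masks of
+different sizes for the same assignment:
+
+ {\footnotesize
+ \begin{verbatim}
+ // GrB_assign: the mask has the same size as all of C (m-by-n)
+ GrB_assign (C, M_big, NULL, A, I, ni, J, nj, NULL) ;
+ // GxB_subassign: the mask has the same size as C(I,J) (ni-by-nj)
+ GxB_subassign (C, M_sub, NULL, A, I, ni, J, nj, NULL) ; \end{verbatim}}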
-%===============================================================================
-\subsection{Example: vector iterator}
-%===============================================================================
+\subsubsection{Assignment with no accumulator operator}

-A vector iterator is used much like an entry iterator for a matrix.
+If no accumulator operator is present, the following rules are used.

- {\footnotesize
- \begin{verbatim}
- // create an iterator
- GxB_Iterator iterator ;
- GxB_Iterator_new (&iterator) ;
- // attach it to the vector v, known to be type GrB_FP64
- GrB_Info info = GxB_Vector_Iterator_attach (iterator, v, NULL) ;
- if (info < 0) { handle the failure ... }
- // seek to the first entry
- info = GxB_Vector_Iterator_seek (iterator, 0) ;
- while (info != GxB_EXHAUSTED)
- {
- // get the entry v(i)
- GrB_Index i = GxB_Vector_Iterator_getIndex (iterator) ;
- double vi = GxB_Iterator_get_FP64 (iterator) ;
- // move to the next entry in v
- info = GxB_Vector_Iterator_next (iterator) ;
- }
- GrB_free (&iterator) ; \end{verbatim}}
+\begin{itemize}
+\item
+For matrix assignment, \verb'A' must be iso. For scalar assignment, the single
+scalar is implicitly expanded into an iso matrix \verb'A' of the right size.
+If these rules do not hold, \verb'C' is not iso.

-%===============================================================================
-\newpage
-\subsection{Performance}
-%===============================================================================
+\item
+If \verb'A' is not iso, or if \verb'C' is not iso on input, then \verb'C' is
+not iso on output.

-I have benchmarked the performance of the row and column iterators to compute
-\verb'y=0' and then \verb'y+=A*x' where \verb'y' is a dense vector and \verb'A'
-is a sparse matrix, using a single thread. The row and column iterators are
-very fast, sometimes only 1\% slower than calling \verb'GrB_mxv' to compute the
-same thing (also assuming a single thread), for large problems. For sparse
-matrices that average just 1 or 2 entries per row, the row iterator can be
-about 30\% slower than \verb'GrB_mxv', likely because of the slightly higher
-complexity of moving from one row to the next using these methods.
+\item
+If \verb'C' is iso or empty on input, and \verb'A' is iso (or scalar assignment
+is being performed) and the iso values $c$ and $a$ (or the scalar $s$) match,
+then the following forms of assignment result in an iso matrix \verb'C' on
+output:

-It is possible to split up the problem for multiple user threads, each with its
-own iterator. Given the low overhead of the row and column iterator for a
-single thread, this should be very fast. Care must be taken to ensure a good
-load balance. Simply spliting up the rows of a matrix and giving the same
-number of rows to each user thread can result in imbalanced work. This is
-handled internally in \verb'GrB_*' methods, but enabling parallelism when using
-iterators is the responsibility of the user application.
+ \begin{itemize}
+ \item \verb'C(I,J) = scalar'
+ \item \verb'C(I,J)<M> = scalar'
+ \item \verb'C(I,J)<!M> = scalar'
+ \item \verb'C(I,J)<M,replace> = scalar'
+ \item \verb'C(I,J)<!M,replace> = scalar'
+ \item \verb'C(I,J) = A'
+ \item \verb'C(I,J)<M> = A'
+ \item \verb'C(I,J)<!M> = A'
+ \item \verb'C(I,J)<M,replace> = A'
+ \item \verb'C(I,J)<!M,replace> = A'
+ \end{itemize}

-The entry iterators are easier to use but harder to implement. The methods
-must internally fuse both inner and outer loops so that the user application can
-use a single loop.
As a result, the computation \verb'y+=A*x' can be up to
-4x slower (about 2x typical) than when using \verb'GrB_mxv' with a single
-thread.
+\item
+For these forms of assignment, \verb'C' is always iso on output, regardless
+of its iso property on input:

-To obtain the best performace possible, many of the iterator methods are
-implemented as macros in \verb'GraphBLAS.h'. Using macros is the default,
-giving typical C and C++ applications access to the fastest methods possible.
+ \begin{itemize}
+ \item \verb'C = scalar'
+ \item \verb'C<M> = scalar'; C empty on input.
+ \item \verb'C<C,struct> = scalar'
+ \end{itemize}

-To ensure access to these methods when not using the macros, these methods are
-also defined as regular functions that appear in the compiled
-\verb'libgraphblas.so' library with the same name as the macros. Applications
-that cannot use the macro versions can \verb'#undef' the macros after the
-\verb'#include ' statement, and then they would access the regular
-compiled functions in \verb'libgraphblas.so'. This non-macro approach is not
-the default, and the iterator methods may be slightly slower.
+\item
+For these forms of assignment, \verb'C' is always iso on output if \verb'A'
+is iso:

-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Iso-Valued Matrices and Vectors } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{iso}
+ \begin{itemize}
+ \item \verb'C = A'
+ \item \verb'C<M> = A'; C empty on input.
+ \end{itemize}
+\end{itemize}

-The GraphBLAS C API states that the entries in all \verb'GrB_Matrix' and
-\verb'GrB_Vector' objects have a numerical value, with either a built-in or
-user-defined type. Representing an unweighted graph requires a value to be
-placed on each edge, typically $a_{ij}=1$. Adding a structure-only data type
-would not mix well with the rest of GraphBLAS, where all operators, monoids,
-and semirings need to operate on a value, of some data type. And yet
-unweighted graphs are very important in graph algorithms.
-The solution is simple, and exploiting it in SuiteSparse:GraphBLAS requires
-nearly no extensions to the GraphBLAS C API. SuiteSparse:GraphBLAS can often
-detect when the user application is creating a matrix or vector where all
-entries in the sparsity pattern take on the same numerical value.
+\subsubsection{Assignment with an accumulator operator}

-For example, ${\bf C \langle C \rangle} = 1$, when the mask is structural, sets
-all entries in $\bf C$ to the value 1. SuiteSparse:GraphBLAS detects this, and
-performs this assignment in $O(1)$ time. It stores a single copy of this
-``iso-value'' and sets an internal flag in the opaque data structure for $\bf
-C$, which states that all entries in the pattern of $\bf C$ are equal to 1.
-This saves both time and memory and allows for the efficient representation of
-sparse adjacency matrices of unweighted graphs, yet does not change the C API.
-To the user application, it still appears that $\bf C$ has \verb'nvals(C)'
-entries, all equal to 1.
+If an accumulator operator is present, the following rules are used.
+Positional operators (\verb'GxB_FIRSTI' and related) cannot be used as
+accumulator operators, so these rules do not consider that case.

-Creating and operating on iso-valued matrices (or just {\em iso matrices} for
-short) is significantly faster than creating matrices with different data
-values. A matrix that is iso requires only $O(1)$ space for its numerical
-values.
The sparse and hypersparse formats require an additional $O(n+e)$ or
-$O(e)$ integer space to hold the pattern of an $n$-by-$n$ matrix \verb'C',
-respectively, and a matrix \verb'C' in bitmap format requires $O(n^2)$ space
-for the bitmap. A full matrix requires no integer storage, so a matrix that is
-both iso and full requires only $O(1)$ space, regardless of its dimension.
+\begin{itemize}
+\item
+For matrix assignment, \verb'A' must be iso. For scalar assignment, the single
+scalar is implicitly expanded into an iso matrix \verb'A' of the right size.
+If these rules do not hold, \verb'C' is not iso.
+
+\item For these forms of assignment, \verb'C' is iso if \verb'C' is
+empty on input, or if $c=c+a$ holds, where $a$ is the iso value of \verb'A' or
+the value of the scalar for scalar assignment.
+
+ \begin{itemize}
+ \item \verb'C(I,J) += scalar'
+ \item \verb'C(I,J)<M> += scalar'
+ \item \verb'C(I,J)<!M> += scalar'
+ \item \verb'C(I,J)<M,replace> += scalar'
+ \item \verb'C(I,J)<!M,replace> += scalar'
+ \item \verb'C(I,J) += A'
+ \item \verb'C(I,J)<M> += A'
+ \item \verb'C(I,J)<!M> += A'
+ \item \verb'C(I,J)<M,replace> += A'
+ \item \verb'C(I,J)<!M,replace> += A'
+ \item \verb'C += A'
+ \end{itemize}
+\end{itemize}
+
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices from build methods}
+%-------------------------------------------------------------------------------
+\label{iso_build}

-The sections below a describe the methods that can be used to create iso
-matrices and vectors. Let $a$, $b$, and $c$ denote the iso values of \verb'A',
-\verb'B', and \verb'C', respectively.
+\verb'GxB_Matrix_build_Scalar' and \verb'GxB_Vector_build_Scalar'
+always construct an iso matrix/vector.
+
+\verb'GrB_Matrix_build' and \verb'GrB_Vector_build' can also construct iso
+matrices and vectors. A non-iso matrix/vector is constructed first, and then
+the entries are checked to see if they are all equal. The resulting iso-valued
+matrix/vector will be efficient to use and will use less memory than a non-iso
+matrix/vector. However, constructing an iso matrix/vector with
+\verb'GrB_Matrix_build' and \verb'GrB_Vector_build' will take more time
+and memory than constructing the matrix/vector with
+\verb'GxB_Matrix_build_Scalar' or \verb'GxB_Vector_build_Scalar'.

%-------------------------------------------------------------------------------
-\subsection{Using iso matrices and vectors in a graph algorithm}
+\subsection{Iso matrices from other methods}
%-------------------------------------------------------------------------------
-\label{iso_usage}
+\label{iso_other}

-There are two primary useful ways to use iso-valued matrices and vectors: (1)
-as iso sparse/hypersparse adjacency matrices for unweighted graphs, and (2) as
-iso full matrices or vectors used with operations that do not need to access
-all of the content of the iso full matrix or vector.
+\begin{itemize}
+\item
+For \verb'GrB_Matrix_dup' and \verb'GrB_Vector_dup', the output matrix/vector
+has the same iso property as the input matrix/vector.

-In the first use case, simply create a \verb'GrB_Matrix' with values that are
-all the same (those in the sparsity pattern). The
-\verb'GxB_Matrix_build_Scalar' method can be used for this, since it
-guarantees that the time and work spent on the numerical part of the array
-is only $O(1)$. The method still must spend $O(e)$ or $O(e \log e)$ time
-on the integer arrays that represent the sparsity pattern, but the reduction
-in time and work on the numerical part of the matrix will improve performance.
+\item +\verb'GrB_*_setElement_*' preserves the iso property of the matrix/vector it +modifies, if the input scalar is equal to the iso value of the matrix/vector. +If the matrix or vector has no entries, the first call to \verb'setElement' +makes it iso. This allows a sequence of \verb'setElement' calls with the same +scalar value to create an entire iso matrix or vector, if starting from +an empty matrix or vector. -The use of \verb'GxB_Matrix_build_Scalar' is optional. Matrices can also be -constructed with \verb'GrB*' methods. In particular, \verb'GrB_Matrix_build_*' -can be used. It first builds a non-iso matrix and then checks if all of the -values are the same, after assembling any duplicate entries. This does not -save time or memory for the construction of the matrix itself, but it will -lead to savings in time and memory later on, when the matrix is used. +\item +\verb'GxB_Matrix_concat' constructs an iso matrix as its result if all input +tiles are either empty or iso. -To ensure a matrix \verb'C' is iso-valued, simply use \verb'GrB_assign' to -compute \verb'C=1', or assign whatever value of scalar you wish. -It is essential to use a structural mask. Otherwise, it is not clear that -all entries in \verb'C' will be assigned the same value. The following -code takes $O(1)$ time, and it resets the size of the numerical part of the -\verb'C' matrix to be $O(1)$ in size: +\item +\verb'GxB_Matrix_split' constructs its output tiles as iso if its input +matrix is iso. -{\footnotesize -\begin{verbatim} - bool scalar = true ; - GrB_Matrix_assign (C, C, NULL, scalar, GrB_ALL, nrows, GrB_ALL, ncols, - GrB_DESC_S) ; \end{verbatim}} +\item +\verb'GxB_Matrix_diag' and \verb'GrB_Matrix_diag' construct an iso matrix if +its input vector is iso. -The Octave/MATLAB analog of the code above is \verb'C=spones(C)'. +\item +\verb'GxB_Vector_diag' constructs an iso vector if its input matrix is iso. -The second case for where iso matrices and vectors are useful is to use them -with operations that do not necessarily access all of their content. -Suppose you have a matrix \verb'A' of arbitrarily large dimension (say -\verb'n'-by-\verb'n' where \verb'n=2^60', of type \verb'GrB_FP64'. A matrix -this large can be represented by SuiteSparse:GraphBLAS, but only in a -hypersparse form. +\item +\verb'GrB_*extract' constructs an iso matrix/vector if its input matrix/vector +is iso. -Now, suppose you wish to compute the maximum value in each row, reducing the -matrix to a vector. This can be done with \verb'GrB_reduce': +\item +\verb'GrB_transpose' constructs an iso matrix if its input is iso. -{\footnotesize -\begin{verbatim} - GrB_Vector_new (&v, GrB_FP64, n) ; - GrB_reduce (v, NULL, GrB_MAX_MONOID_FP64, A, NULL) ; \end{verbatim}} +\item +The \verb'GxB_import/export/pack/unpack' methods preserve the iso property +of their matrices/vectors. +\end{itemize} -It can also be done with \verb'GrB_mxv', by creating an iso full vector -\verb'x'. The creation of \verb'x' takes $O(1)$ time and memory, -and the \verb'GrB_mxv' computation takes $O(e)$ time (with modest assumptions; -if \verb'A' needs to be transposed the time would be $O(e \log e)$). 
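+As an illustration of the \verb'setElement' rule above (a sketch only; the
+value 3.0 and the dimension \verb'n' are arbitrary, with \verb'n' at least 3),
+a sequence of calls with the same scalar builds an entirely iso matrix:
+
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Matrix C = NULL ;
+ bool C_iso = false ;
+ GrB_Matrix_new (&C, GrB_FP64, n, n) ;
+ GrB_Matrix_setElement_FP64 (C, 3.0, 0, 0) ;   // C becomes iso
+ GrB_Matrix_setElement_FP64 (C, 3.0, 1, 2) ;   // C remains iso
+ GrB_Matrix_setElement_FP64 (C, 3.0, 2, 1) ;   // C remains iso
+ GrB_Matrix_wait (C, GrB_MATERIALIZE) ;        // assemble pending tuples
+ GxB_Matrix_iso (&C_iso, C) ;                  // C_iso is now true \end{verbatim}}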
+%-------------------------------------------------------------------------------
+\subsection{Iso matrices not exploited}
+%-------------------------------------------------------------------------------

-{\footnotesize
-\begin{verbatim}
- GrB_Vector_new (&v, GrB_FP64, n) ;
- GrB_Vector_new (&x, GrB_FP64, n) ;
- GrB_assign (x, NULL, NULL, 1, GrB_ALL, n, NULL) ;
- GrB_mxv (v, NULL, NULL, GrB_MAX_FIRST_SEMIRING_FP64, A, x, NULL) ; \end{verbatim}}
+There are many cases where a matrix may have the iso property but it is not
+detected by SuiteSparse:GraphBLAS. For example, if \verb'A' is non-iso,
+\verb'C=A(I,J)' from \verb'GrB_extract' may be iso, if all entries in the
+extracted submatrix have the same value. Future versions of
+SuiteSparse:GraphBLAS may extend the rules described in this section to detect
+these cases.

-The above computations are identical in SuiteSparse:GraphBLAS. Internally,
-\verb'GrB_reduce' creates \verb'x' and calls \verb'GrB_mxv'. Using
-\verb'GrB_mxm' directly gives the user application additional flexibility in
-creating new computations that exploit the multiplicative operator in the
-semiring. \verb'GrB_reduce' always uses the \verb'FIRST' operator in its
-semiring, but any other binary operator can be used instead when using
-\verb'GrB_mxv'.
+\newpage
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Performance} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\label{perf}

-Below is a method for computing the argmax of each row of a square matrix
-\verb'A' of dimension \verb'n' and type \verb'GrB_FP64'. The vector \verb'x'
-contains the maximum value in each row, and the vector \verb'p' contains the
-zero-based column index of the maximum value in each row. If there are
-duplicate maximum values in each row, any one of them is selected arbitrarily
-using the \verb'ANY' monoid. To select the minimum column index of the
-duplicate maximum values, use the \verb'GxB_MIN_SECONDI_INT64' semiring instead
-(this will be slightly slower than the \verb'ANY' monoid if there are many
-duplicates).
+Getting the best performance out of an algorithm that relies on GraphBLAS can
+depend on many factors. This section describes some of the possible
+performance pitfalls you can hit when using SuiteSparse:GraphBLAS, and how to
+avoid them (or at least know when you've encountered them).

-To compute the argmax of each column, use the \verb'GrB_DESC_T0' descriptor
-in \verb'GrB_mxv', and compute \verb'G=A*D' instead of \verb'G=D*A' with
-\verb'GrB_mxm'. See the \verb'GrB.argmin' and \verb'GrB.argmax' functions
-in the Octave/MATLAB interface for details.
+%-------------------------------------------------------------------------------
+\subsection{The burble is your friend}
+%-------------------------------------------------------------------------------

-% corresponds to GrB.argmax with dim = 2
+Turn on the burble with \verb'GxB_set (GxB_BURBLE, true)'. You will get a
+single line of output from each (significant) call to GraphBLAS.
+The burble output can help you detect when you are likely using sub-optimal
+methods, as described in the next sections.
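+For example, to examine a single call of interest (a sketch; the matrices
+\verb'C', \verb'A', \verb'B', and the \verb'semiring' are assumed to already
+exist), the burble can be enabled just around that call:
+
+ {\scriptsize
+ \begin{verbatim}
+ GxB_set (GxB_BURBLE, true) ;     // enable the burble
+ GrB_mxm (C, NULL, NULL, semiring, A, B, NULL) ;
+ GxB_set (GxB_BURBLE, false) ;    // disable it after the call of interest \end{verbatim}}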
-{\footnotesize
-\begin{verbatim}
- GrB_Vector_new (&x, GrB_FP64, n) ;
- GrB_Vector_new (&y, GrB_FP64, n) ;
- GrB_Vector_new (&p, GrB_INT64, n) ;
- // y (:) = 1, an iso full vector
- GrB_assign (y, NULL, NULL, 1, GrB_ALL, n, NULL) ;
- // x = max (A) where x(i) = max (A (i,:))
- GrB_mxv (x, NULL, NULL, GrB_MAX_FIRST_SEMIRING_FP64, A, y, NULL) ;
- // D = diag (x)
- GrB_Matrix_diag (&D, x, 0) ;
- // G = D*A using the ANY_EQ semiring
- GrB_Matrix_new (&G, GrB_BOOL, n, n) ;
- GrB_mxm (G, NULL, NULL, GxB_ANY_EQ_FP64, D, A, NULL) ;
- // drop explicit zeros from G
- GrB_select (G, NULL, NULL, GrB_VALUENE_BOOL, G, 0, NULL) ;
- // find the position of any max entry in each row: p = G*y,
- // so that p(i) = j if x(i) = A(i,j) = max (A (i,:))
- GrB_mxv (p, NULL, NULL, GxB_ANY_SECONDI_INT64, G, y, NULL) ; \end{verbatim}}
+%-------------------------------------------------------------------------------
+\subsection{Data types and typecasting}
+%-------------------------------------------------------------------------------

-No part of the above code takes $\Omega(n)$ time or memory. The data type of
-the iso full vector \verb'y' can be anything, and its iso value can be
-anything. It is operated on by the \verb'FIRST' operator in the first
-\verb'GrB_mxv', and the \verb'SECONDI' positional operator in the second
-\verb'GrB_mxv', and both operators are oblivious to the content and even the
-type of \verb'y'. The semirings simply note that \verb'y' is a full vector and
-compute their result according, by accessing the matrices only (\verb'A' and
-\verb'G', respectively).
+As much as possible, avoid mixing data types and relying on typecasting.
+SuiteSparse:GraphBLAS has a set of highly-tuned kernels for each data type,
+and many operators and semirings, but there are too many combinations to
+generate ahead of time. If typecasting is required, or if
+SuiteSparse:GraphBLAS does not have a kernel for the specific operator or
+semiring, the word \verb'generic' will appear in the burble. The generic
+methods rely on function pointers for each operation on every scalar, so they
+are slow. A future JIT will avoid this problem.

-For floating-point values, \verb'NaN' values are ignored, and treated as if
-they were not present in the input matrix, unless all entries in a given row
-are equal to \verb'NaN'. In that case, if all entries in \verb'A(i,:)' are
-equal to \verb'NaN', then \verb'x(i)' is \verb'NaN' and the entry \verb'p(i)'
-is not present.
+The only time that typecasting is fast is when computing \verb'C=A' via
+\verb'GrB_assign' or \verb'GrB_apply', where the data types of \verb'C' and
+\verb'A' can differ. In this case, one of $13^2 = 169$ kernels is called,
+each of which performs the specific typecasting requested, without relying on
+function pointers.

%-------------------------------------------------------------------------------
-\subsection{Iso matrices from matrix multiplication}
+\subsection{Matrix data structures: sparse, hypersparse, bitmap, or full}
%-------------------------------------------------------------------------------
-\label{iso_mxm}

-Consider \verb'GrB_mxm', \verb'GrB_mxv', and \verb'GrB_vxm', and
- let \verb'C=A*B', where no mask is present, or \verb'C=A*B' where
- \verb'C' is initially empty. If \verb'C' is not initially empty,
- then these rules apply to a temporary matrix \verb'T=A*B', which is
- initially empty and is then assigned to \verb'C' via \verb'C=T'.
+SuiteSparse:GraphBLAS tries to automatically determine the best data structure
+for your matrices and vectors, selecting between sparse, hypersparse, bitmap,
+and full formats. By default, all 4 formats can be used. A matrix typically
+starts out hypersparse when it is created by \verb'GrB_Matrix_new', and then
+changes during its lifetime, possibly taking on all four different formats
+at different times. This can be modified via \verb'GxB_set'. For example,
+this line of code:

- The iso property of \verb'C' is determined with the following rules,
- where the first rule that fits defines the property and value of \verb'C'.

+ {\scriptsize
+ \begin{verbatim}
+ GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_BITMAP) ; \end{verbatim}}

- \begin{itemize}
- \item If the semiring includes a positional multiplicative operator
- (\verb'GxB_FIRSTI', \verb'GrB_SECONDI', and related operators), then
- \verb'C' is never iso.
+\noindent
+tells SuiteSparse that the matrix \verb'A' can be held in either sparse or
+bitmap format (at its discretion), but not hypersparse or full. The bitmap
+format will be used if the matrix has enough entries, or sparse otherwise.
+Sometimes this selection is best controlled by the user algorithm, so a single
+format can be requested:

- \item Define an {\em iso-monoid} as a built-in monoid with the property
- that reducing a set of $n>1$ identical values $x$ returns the same value
- $x$. These are the \verb'MIN' \verb'MAX' \verb'LOR' \verb'LAND' \verb'BOR'
- \verb'BAND' and \verb'ANY' monoids. All other monoids are not iso monoids:
- \verb'PLUS', \verb'TIMES', \verb'LXNOR', \verb'EQ', \verb'BXOR',
- \verb'BXNOR', and all user-defined monoids. Currently, there is no
- mechanism for telling SuiteSparse:GraphBLAS that a user-defined monoid
- is an iso-monoid.

+ {\scriptsize
+ \begin{verbatim}
+ GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE) ; \end{verbatim}}
+
+This ensures that SuiteSparse will primarily use the sparse format. This is
+still just a hint, however. The data structure is opaque and SuiteSparse is
+free to choose otherwise. In particular, if you insist on using only the
+\verb'GxB_FULL' format, then that format is used when all entries are present.
+However, if the matrix is not actually full with all entries present, then the
+bitmap format is used instead. The full format does not preserve the sparsity
+structure in this case. Any GraphBLAS library must preserve the proper
+structure, per the C Specification. This is critical in a graph algorithm,
+since an edge $(i,j)$ of weight zero, say, is not the same as no edge $(i,j)$
+at all.
+
+%-------------------------------------------------------------------------------
+\subsection{Matrix formats: by row or by column, or using the transpose of
+a matrix}
+%-------------------------------------------------------------------------------
+
+By default, SuiteSparse uses a simple rule:
+all matrices are held by row, unless they consist of a single
+column, in which case they are held by column. All vectors are treated as if
+they are $n$-by-1 matrices with a single column. Changing formats from
+row-oriented to column-oriented can have significant performance implications,
+so SuiteSparse never tries to outguess the application. It just uses this
+simple rule.

- \item If the multiplicative op is \verb'PAIR' (same as \verb'ONEB'),
- and the monoid is an
- iso-monoid, or the \verb'EQ' or \verb'TIMES' monoids, then \verb'C' is
- iso with a value of 1.
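+The orientation that SuiteSparse has chosen for a matrix can be queried with
+\verb'GxB_get' (a sketch; the matrix \verb'A' is assumed to already exist):
+
+ {\scriptsize
+ \begin{verbatim}
+ GxB_Format_Value fmt ;
+ GxB_get (A, GxB_FORMAT, &fmt) ;   // fmt is GxB_BY_ROW or GxB_BY_COL \end{verbatim}}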
+However, there are cases where changing the format can greatly improve
+performance. There are two ways to handle this, which in the end are
+equivalent in the SuiteSparse internals. You can change the format (row to
+column oriented, or vice versa), or work with the explicit transpose of a
+matrix in the same storage orientation.

- \item If both \verb'B' and the monoid are iso, and the multiplicative op is
- \verb'SECOND' or \verb'ANY', then \verb'C' is iso with a value of $b$.
+There are cases where SuiteSparse must explicitly transpose an input matrix, or
+the output matrix, in order to perform a computation. For example, if all
+matrices are held in row-oriented fashion, SuiteSparse does not have a method
+for computing \verb"C=A'*B", where \verb'A' is transposed. Thus, SuiteSparse
+either computes a temporary transpose of its input matrix \verb'AT=A' and then
+\verb'C=AT*B', or it swaps the computations, performing \verb"C=(B'*A)'", which
+requires an explicit transpose of \verb'BT=B', and a transpose of the final
+result to obtain \verb'C'.

- \item If both \verb'A' and the monoid are iso, and the multiplicative op is
- \verb'FIRST' or \verb'ANY', then \verb'C' is iso with a value of $a$.
+These temporary transposes are costly to compute, taking time and memory. They
+are not kept, but are discarded when the method returns to the user
+application. If you see the term \verb'transpose' in the burble output, and if
+you need to perform this computation many times, try constructing your own
+explicit transpose, say \verb"AT=A'", via \verb'GrB_transpose', or create a
+copy of \verb'A' but held in another orientation via \verb'GxB_set'. For
+example, assuming the default matrix format is by-row, and that \verb'A' is
+\verb'm'-by-\verb'n' of type \verb'GrB_FP32':

- \item If \verb'A', \verb'B', and the monoid are all iso, then \verb'C'
- is iso, with a value $c=f(a,b)$, where $f$ is any multiplicative op
- (including user-defined, which assumes that a user-defined $f$ has no
- side effects).
+ {\scriptsize
+ \begin{verbatim}
+ // method 1: AT = A'
+ GrB_Matrix_new (&AT, GrB_FP32, n, m) ;
+ GrB_transpose (AT, NULL, NULL, A, NULL) ;

- \item If \verb'A' and \verb'B' are both iso and full (all entries present,
- regardless of the format of the matrices), then \verb'C' is iso and full.
- Its iso value is computed in $O(\log(n))$ time, via a reduction of $n$
- copies of the value $t=f(a,b)$ to a scalar. The storage required to
- represent \verb'C' is just $O(1)$, regardless of its dimension.
- Technically, the \verb'PLUS' monoid could be computed as $c=nt$ in $O(1)$
- time, but the log-time reduction works for any monoid, including
- user-defined ones.
+ // method 2: A2 = A but held by column instead of by row
+ // note: doing the set before the assign is faster than the reverse
+ GrB_Matrix_new (&A2, GrB_FP32, m, n) ;
+ GxB_set (A2, GxB_FORMAT, GxB_BY_COL) ;
+ GrB_assign (A2, NULL, NULL, A, GrB_ALL, m, GrB_ALL, n, NULL) ; \end{verbatim}}

- \item Otherwise, \verb'C' is not iso.
- \end{itemize}
+Internally, the data structures for \verb'AT' and \verb'A2' are nearly identical
+(that is, the transpose of \verb'A' held in row format is the same as \verb'A'
+held in column format). Using either of them in subsequent calls to GraphBLAS
+will allow SuiteSparse to avoid computing an explicit transpose. The two
+matrices \verb'AT' and \verb'A2' do differ in one very significant way: their
+dimensions are different, and they behave differently mathematically.
+Computing \verb"C=A'*B" using these matrices would differ:

-%-------------------------------------------------------------------------------
-\subsection{Iso matrices from eWiseMult and kronecker}
-%-------------------------------------------------------------------------------
-\label{iso_emult}
+ {\scriptsize
+ \begin{verbatim}
+ // method 1: C=A'*B using AT
+ GrB_mxm (C, NULL, NULL, semiring, AT, B, NULL) ;

-Consider \verb'GrB_eWiseMult'. Let
-\verb'C=A.*B', or \verb'C=A.*B' with any mask and where \verb'C' is
-initially empty, where \verb'.*' denotes a binary operator $f(x,y)$
-applied with \verb'eWiseMult'. These rules also apply to \verb'GrB_kronecker'.
+ // method 2: C=A'*B using A2
+ GrB_mxm (C, NULL, NULL, semiring, A2, B, GrB_DESC_T0) ; \end{verbatim}}

- \begin{itemize}
- \item If the operator is positional (\verb'GxB_FIRSTI' and related) then
- \verb'C' is not iso.
+The first method computes \verb'C=AT*B'. The second method computes
+\verb"C=A2'*B", but the result of both computations is the same, and internally
+the same kernels will be used.

- \item If the op is \verb'PAIR' (same as \verb'ONEB'),
- then \verb'C' is iso with $c=1$.
+%-------------------------------------------------------------------------------
+\subsection{Push/pull optimization}
+%-------------------------------------------------------------------------------

- \item If \verb'B' is iso and the op is \verb'SECOND' or \verb'ANY',
- then \verb'C' is iso with $c=b$.
+Closely related to the discussion above on when to use a matrix or its
+transpose is the exploitation of ``push/pull'' direction optimization. In
+linear algebraic terms, this is simply deciding whether to multiply by the
+matrix or its transpose. Examples can be seen in the BFS and
+Betweenness-Centrality methods of LAGraph. Here is the BFS kernel:

- \item If \verb'A' is iso and the op is \verb'FIRST' or \verb'ANY',
- then \verb'C' is iso with $c=a$.
+ {\scriptsize
+ \begin{verbatim}
+ int sparsity = do_push ? GxB_SPARSE : GxB_BITMAP ;
+ GxB_set (q, GxB_SPARSITY_CONTROL, sparsity) ;
+ if (do_push)
+ {
+ // q'{!pi} = q'*A
+ GrB_vxm (q, pi, NULL, semiring, q, A, GrB_DESC_RSC) ;
+ }
+ else
+ {
+ // q{!pi} = AT*q
+ GrB_mxv (q, pi, NULL, semiring, AT, q, GrB_DESC_RSC) ;
+ }\end{verbatim}}

- \item If both \verb'A' and \verb'B' are iso,
- then \verb'C' is iso with $c=f(a,b)$.
+The call to \verb'GxB_set' is optional, since SuiteSparse will likely already
+determine that a bitmap format will work best when the frontier \verb'q' has
+many entries, which is also when the pull step is fastest. The push step
+relies on a sparse vector times sparse matrix method originally due to
+Gustavson. The output is computed as a set union of all rows \verb'A(i,:)'
+where \verb'q(i)' is present on input. This set union is very fast when
+\verb'q' is very sparse. The pull step relies on a sequence of dot product
+computations, one per possible entry in the output \verb'q', and it uses the
+matrix \verb"AT" which is a row-oriented copy of the explicit transpose of the
+adjacency matrix \verb'A'.

- \item Otherwise, \verb'C' is not iso.
- \end{itemize}
+Mathematically, the results of the two methods are identical, but internally,
+the data format of the input matrices is very different (using \verb'A' held
+by row, or \verb'AT' held by row which is the same as a copy of \verb'A' that
+is held by column), and the algorithms used are very different.
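+A user application that maintains its own \verb'do_push' flag can use a
+simple size-based rule to choose the direction (a sketch only; the threshold
+of 32 is hypothetical, and LAGraph uses a more refined heuristic):
+
+ {\scriptsize
+ \begin{verbatim}
+ GrB_Index nq, n ;
+ GrB_Vector_nvals (&nq, q) ;       // number of entries in the frontier q
+ GrB_Vector_size (&n, q) ;         // number of nodes in the graph
+ bool do_push = (32 * nq <= n) ;   // push if the frontier is very sparse \end{verbatim}}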
%-------------------------------------------------------------------------------
-\subsection{Iso matrices from eWiseAdd}
+\subsection{Computing with full matrices and vectors}
%-------------------------------------------------------------------------------
-\label{iso_add}

-Consider \verb'GrB_eWiseAdd', and also the accumulator phase of \verb'C+=T'
-when an accumulator operator is present. Let \verb'C=A+B', or \verb'C=A+B'
-with any mask and where \verb'C' is initially empty.
+Sometimes the best approach to getting the highest performance is to use dense
+vectors, and occasionally dense matrices that are tall-and-thin or
+short-and-fat. Packages such as Julia, Octave, or MATLAB, when dealing with
+the conventional plus-times semirings, assume that multiplying a sparse matrix
+\verb'A' times a dense vector \verb'x', \verb'y=A*x', will result in a dense
+vector \verb'y'. This is not always the case, however. GraphBLAS must always
+return a result that respects the sparsity structure of the output matrix or
+vector. If the $i$th row of \verb'A' has no entries then \verb'y(i)' must not
+appear as an entry in the vector \verb'y', so it cannot be held as a full
+vector. As a result, the following computation can be slower than it could be:

- \item If both \verb'A' and \verb'B' are full (all entries present), then
- the rules for \verb'eWiseMult' in Section~\ref{iso_emult} are used
- instead.
+ {\scriptsize
+ \begin{verbatim}
+ GrB_mxv (y, NULL, NULL, semiring, A, x, NULL) ; \end{verbatim}}

- \item If the operator is positional (\verb'GxB_FIRSTI' and related) then
- \verb'C' is not iso.
+SuiteSparse must do extra work to compute the sparsity of this vector \verb'y',
+but if this is not needed, and \verb'y' can be padded with zeros (or
+the identity value of the monoid, to be precise), a faster method can be used,
+by relying on the accumulator. Instead of computing \verb'y=A*x', set all
+entries of \verb'y' to zero first, and then compute \verb'y+=A*x' where the
+accumulator operator and type match the monoid of the semiring. SuiteSparse
+has special kernels for this case; you can see them in the burble as
+\verb'F+=S*F' for example.

- \item If $a$ and $b$ differ (when typecasted to the type of \verb'C'),
- then \verb'C' is not iso.
+ {\scriptsize
+ \begin{verbatim}
+ // y = 0
+ GrB_assign (y, NULL, NULL, 0, GrB_ALL, n, NULL) ;
+ // y += A*x
+ GrB_mxv (y, NULL, GrB_PLUS_FP32, GrB_PLUS_TIMES_SEMIRING_FP32, A, x, NULL) ; \end{verbatim}}

+You can see this computation in the LAGraph PageRank method, where all
+entries of \verb'r' are set to the \verb'teleport' scalar first.

- \item If $c=f(a,b) = a = b$ holds, then \verb'C' is iso,
- where $f(a,b)$ is the operator.

- \item Otherwise, \verb'C' is not iso.
- \end{itemize}

 {\scriptsize
 \begin{verbatim}
+ for (iters = 0 ; iters < itermax && rdiff > tol ; iters++)
+ {
+ // swap t and r ; now t is the old score
+ GrB_Vector temp = t ; t = r ; r = temp ;
+ // w = t ./ d
+ GrB_eWiseMult (w, NULL, NULL, GrB_DIV_FP32, t, d, NULL) ;
+ // r = teleport
+ GrB_assign (r, NULL, NULL, teleport, GrB_ALL, n, NULL) ;
+ // r += A'*w
+ GrB_mxv (r, NULL, GrB_PLUS_FP32, LAGraph_plus_second_fp32, AT, w, NULL) ;
+ // t -= r
+ GrB_assign (t, NULL, GrB_MINUS_FP32, r, GrB_ALL, n, NULL) ;
+ // t = abs (t)
+ GrB_apply (t, NULL, NULL, GrB_ABS_FP32, t, NULL) ;
+ // rdiff = sum (t)
+ GrB_reduce (&rdiff, NULL, GrB_PLUS_MONOID_FP32, t, NULL) ;
+ } \end{verbatim}}
+
+SuiteSparse exploits the iso-valued property of the scalar-to-vector assignment
+of \verb'y=0', or \verb'r=teleport', and performs these assignments in O(1)
+time and space. Because the \verb'r' vector starts out as full on input to
+\verb'GrB_mxv', and because there is an accumulator with no mask, no entries in
+the input/output vector \verb'r' will be deleted, even if \verb'A' has empty
+rows. The call to \verb'GrB_mxv' exploits this, and is able to use a fast
+kernel for this computation. SuiteSparse does not need to compute the sparsity
+pattern of the vector \verb'r'.

%-------------------------------------------------------------------------------
-\subsection{Iso matrices from eWiseUnion}
+\subsection{Iso-valued matrices and vectors}
%-------------------------------------------------------------------------------
-\label{iso_union}

-\verb'GxB_eWiseUnion' is very similar to \verb'GrB_eWiseAdd', but the rules
-for when the result is iso-valued are very different.

- \begin{itemize}
- \item If both \verb'A' and \verb'B' are full (all entries present), then
- the rules for \verb'eWiseMult' in Section~\ref{iso_emult} are used
- instead.

- \item If the operator is positional (\verb'GxB_FIRSTI' and related) then
- \verb'C' is not iso.
+Using iso-valued matrices and vectors is always faster than using matrices and
+vectors whose entries can have different values. Iso-valued matrices are very
+important in graph algorithms. For example, an unweighted graph is best
+represented as an iso-valued sparse matrix, and unweighted graphs are very
+common. The burble output, \verb'GxB_print', \verb'GxB_Matrix_iso', or
+\verb'GxB_Vector_iso' can all be used to report whether or not your matrix or
+vector is iso-valued.

- \item If the op is \verb'PAIR' (same as \verb'ONEB'),
- then \verb'C' is iso with $c=1$.
+Sometimes a matrix or vector may have values that are all the same, but
+SuiteSparse hasn't detected this. If this occurs, you can force a matrix
+or vector to be iso-valued by assigning a single scalar to all its entries.

- \item If \verb'B' is iso and the op is \verb'SECOND' or \verb'ANY',
- and the input scalar \verb'beta' matches $b$
- (the iso-value of \verb'B'),
- then \verb'C' is iso with $c=b$.
+ {\scriptsize
+ \begin{verbatim}
+ // C = 3.14159
+ GrB_assign (C, C, NULL, 3.14159, GrB_ALL, m, GrB_ALL, n, GrB_DESC_S) ; \end{verbatim}}

- \item If \verb'A' is iso and the op is \verb'FIRST' or \verb'ANY',
- and the input scalar \verb'alpha' matches $a$
- (the iso-value of \verb'A'),
- then \verb'C' is iso with $c=a$.
+The matrix \verb'C' is used as its own mask.

- \item If both \verb'A' and \verb'B' are iso,
- and $f(a,b) = f(\alpha,b) = f(a,\beta)$,
- then \verb'C' is iso with $c=f(a,b)$.
The descriptor is essential here, +telling the mask to be used in a structural sense, without regard to the values +of the entries in the mask. This assignment sets all entries that already +exist in \verb'C' to be equal to a single value, 3.14159. The sparsity +structure of \verb'C' does not change. Of course, any scalar can be used; the +value 1 is common for unweighted graphs. SuiteSparse:GraphBLAS performs the +above assignment in O(1) time and space, independent of the dimension of +\verb'C' or the number of entries in contains. %------------------------------------------------------------------------------- -\subsection{Reducing iso matrices to a scalar or vector} +\subsection{User-defined types and operators} %------------------------------------------------------------------------------- -\label{iso_reduce} -If \verb'A' is iso with $e$ entries, reducing it to a scalar takes $O(\log(e))$ -time, regardless of the monoid used to reduce the matrix to a scalar. Reducing -\verb'A' to a vector \verb'c' is the same as the matrix-vector multiply -\verb"c=A*x" or \verb"c=A'*x", depending on the descriptor, where \verb'x' -is an iso full vector (refer to Section~\ref{iso_mxm}). +These are currently slow. Once SuiteSparse:GraphBLAS employs a JIT +accelerator, these data types and operators will be just as fast as built-in +types and operators. This work is in progress for the GPU, in CUDA, in +collaboration with Joe Eaton and Corey Nolet. %------------------------------------------------------------------------------- -\subsection{Iso matrices from apply} +\subsection{About NUMA systems} %------------------------------------------------------------------------------- -\label{iso_apply} - -Let \verb'C=f(A)' denote the application of a unary operator \verb'f', -and let \verb'C=f(A,s)' and \verb'C=f(s,A)' denote the application of a binary -operator with \verb's' a scalar. - - \begin{itemize} - \item If the operator is positional (\verb'GxB_POSITION*', - \verb'GxB_FIRSTI', and related) then \verb'C' is not iso. - - \item If the operator is \verb'ONE' or \verb'PAIR' (same as \verb'ONEB'), - then \verb'C' iso with $c=1$. - - \item If the operator is \verb'FIRST' or \verb'ANY' with \verb'C=f(s,A)', - then \verb'C' iso with $c=s$. - \item If the operator is \verb'SECOND' or \verb'ANY' with \verb'C=f(A,s)', - then \verb'C' iso with $c=s$. +I have tested this package extensively on multicore single-socket systems, but +have not yet optimized it for multi-socket systems with a NUMA architecture. +That will be done in a future release. If you publish benchmarks +with this package, please state the SuiteSparse:GraphBLAS version, and a caveat +if appropriate. If you see significant performance issues when going from a +single-socket to multi-socket system, I would like to hear from you so I can +look into it. - \item If \verb'A' is iso then \verb'C' is iso, with the following value - of $c$: +\newpage +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Examples} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\label{examples} - \begin{itemize} - \item If the op is \verb'IDENTITY', then $c=a$. - \item If the op is unary with \verb'C=f(A)', then $c=f(a)$. - \item If the op is binary with \verb'C=f(s,A)', then $c=f(s,a)$. - \item If the op is binary with \verb'C=f(A,s)', then $c=f(a,s)$. - \end{itemize} +Several examples of how to use GraphBLAS are listed below. 
+They all appear in the \verb'Demo' folder of SuiteSparse:GraphBLAS.  Programs
+in the \verb'Demo' folder are meant as simple examples; for the fastest
+methods, see LAGraph (Section~\ref{lagraph}).
+\begin{enumerate}
+\item creating a random matrix
+\item creating a finite-element matrix
+\item reading a matrix from a file
+\item complex numbers as a user-defined type
+\item matrix import/export
+\end{enumerate}
- \item Otherwise, \verb'C' is not iso.
- \end{itemize}
+Additional examples appear in the newly created LAGraph project, currently in
+progress.
 %-------------------------------------------------------------------------------
-\subsection{Iso matrices from select}
+\subsection{LAGraph}
 %-------------------------------------------------------------------------------
-\label{iso_select}
-Let \verb'C=select(A)' denote the application of a \verb'GrB_IndexUnaryOp' operator
-in \verb'GrB_select'.
+The LAGraph project is a community-wide effort to create graph algorithms based
+on GraphBLAS (any implementation of the API, not just SuiteSparse:GraphBLAS).
+Some of the algorithms and utilities in LAGraph are listed in the table below.
+Many additional algorithms are planned.  Refer to
+\url{https://github.com/GraphBLAS/LAGraph} for a current list of algorithms.  All
+functions in the \verb'Demo/' folder in SuiteSparse:GraphBLAS will eventually
+be translated into algorithms or utilities for LAGraph, and then removed
+from \verb'GraphBLAS/Demo'.
- \begin{itemize}
- \item If \verb'A' is iso, then \verb'C' is iso with $c=a$.
- \item If the operator is any \verb'GrB_VALUE*_BOOL' operator,
- with no typecasting, and the test is true only for a single boolean
- value, then \verb'C' is iso.
- \item If the operator is \verb'GrB_VALUEEQ_*', with no typecasting,
- then \verb'C' is iso, with $c=t$ where $t$ is the value of the scalar
- \verb'y'.
- \item If the operator is \verb'GrB_VALUELE_UINT*', with no typecasting,
- and the scalar \verb'y' is zero, then \verb'C' is iso with $c=0$.
- \item Otherwise, \verb'C' is not iso.
- \end{itemize}
+To use LAGraph with SuiteSparse:GraphBLAS, place the two folders \verb'LAGraph'
+and \verb'GraphBLAS' in the same parent directory.  This allows the
+\verb'cmake' script in LAGraph to find the copy of GraphBLAS.  Alternatively,
+the GraphBLAS source could be placed anywhere, as long as
+\verb'sudo make install' is performed.
 %-------------------------------------------------------------------------------
-\subsection{Iso matrices from assign and subassign}
+\subsection{Creating a random matrix}
 %-------------------------------------------------------------------------------
-\label{iso_assign}
-These rules are somewhat complex.  Consider the assignment \verb'C(I,J)=...'
-with \verb'GrB_assign'.  Internally, this assignment is converted into
-\verb'C(I,J)=...' and then \verb'GxB_subassign' is used.  Thus,
-all of the rules below assume the form \verb'C(I,J)=...' where \verb'M'
-has the same size as the submatrix \verb'C(I,J)'.
+\label{random}
+The \verb'random_matrix' function in the \verb'Demo' folder generates a random
+matrix with a specified dimension and number of entries, either symmetric or
+unsymmetric, and with or without self-edges (diagonal entries in the matrix).
+It relies on \verb'simple_rand*' functions in the \verb'Demo' folder to provide
+a portable random number generator that creates the same sequence on any
+computer and operating system.
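+The exact implementation is in the \verb'Demo' folder; as a hedged sketch of
+the idea, any fixed linear congruential generator yields the same sequence on
+every platform.  The generator below is an illustration only (using Knuth's
+MMIX constants), not the actual \verb'simple_rand' code:
+
+ {\footnotesize
+ \begin{verbatim}
+ // a portable 64-bit LCG sketch (requires <stdint.h>); illustration only
+ static uint64_t state = 1 ;
+ static inline uint64_t my_rand (void)   // hypothetical helper
+ {
+     state = state * 6364136223846793005ULL + 1442695040888963407ULL ;
+     return (state) ;
+ } \end{verbatim}}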
-\subsubsection{Assignment with no accumulator operator}
+\verb'random_matrix' can use one of two methods: \verb'GrB_Matrix_setElement'
+and \verb'GrB_Matrix_build'.  The former method is very simple to use:
-If no accumulator operator is present, the following rules are used.
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Matrix_new (&A, GrB_FP64, nrows, ncols) ;
+ for (int64_t k = 0 ; k < ntuples ; k++)
+ {
+     GrB_Index i = simple_rand_i ( ) % nrows ;
+     GrB_Index j = simple_rand_i ( ) % ncols ;
+     if (no_self_edges && (i == j)) continue ;
+     double x = simple_rand_x ( ) ;
+     // A (i,j) = x
+     GrB_Matrix_setElement (A, x, i, j) ;
+     if (make_symmetric)
+     {
+         // A (j,i) = x
+         GrB_Matrix_setElement (A, x, j, i) ;
+     }
+ } \end{verbatim}}
-\begin{itemize}
-\item
-For matrix assignment, \verb'A' must be iso.  For scalar assignment, the single
-scalar is implicitly expanded into an iso matrix \verb'A' of the right size.
-If these rules do not hold, \verb'C' is not iso.
+The above code can generate a million-by-million sparse \verb'double' matrix
+with 200 million entries in 66 seconds (6 seconds of which is the time to
+generate the random \verb'i', \verb'j', and \verb'x'), including the time
+to finish all pending computations.  The user application does not need to
+create a list of all the tuples, nor does it need to know how many entries will
+appear in the matrix.  It just starts from an empty matrix and adds them one at
+a time in arbitrary order.  GraphBLAS handles the rest.  This method is not
+feasible in MATLAB.
-\item
-If \verb'A' is not iso, or if \verb'C' is not iso on input, then \verb'C' is
-not iso on output.
+The next method uses \verb'GrB_Matrix_build'.  It is more complex to use than
+\verb'setElement' since it requires the user application to allocate and fill
+the tuple lists, and it requires knowledge of how many entries will appear in
+the matrix, or at least a good upper bound, before the matrix is constructed.
+It is slightly faster, creating the same matrix in 60 seconds, 51 seconds
+of which is spent in \verb'GrB_Matrix_build'.
-If \verb'C' is iso or empty on input, and \verb'A' is iso (or scalar assignment
-is begin performed) and the iso values $c$ and $a$ (or the scalar $s$) match,
-then the following forms of assignment result in an iso matrix \verb'C' on
-output:
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Index *I, *J ;
+ double *X ;
+ int64_t s = ((make_symmetric) ? 2 : 1) * nedges + 1 ;
+ I = malloc (s * sizeof (GrB_Index)) ;
+ J = malloc (s * sizeof (GrB_Index)) ;
+ X = malloc (s * sizeof (double )) ;
+ if (I == NULL || J == NULL || X == NULL)
+ {
+     // out of memory
+     if (I != NULL) free (I) ;
+     if (J != NULL) free (J) ;
+     if (X != NULL) free (X) ;
+     return (GrB_OUT_OF_MEMORY) ;
+ }
+ int64_t ntuples = 0 ;
+ for (int64_t k = 0 ; k < nedges ; k++)
+ {
+     GrB_Index i = simple_rand_i ( ) % nrows ;
+     GrB_Index j = simple_rand_i ( ) % ncols ;
+     if (no_self_edges && (i == j)) continue ;
+     double x = simple_rand_x ( ) ;
+     // A (i,j) = x
+     I [ntuples] = i ;
+     J [ntuples] = j ;
+     X [ntuples] = x ;
+     ntuples++ ;
+     if (make_symmetric)
+     {
+         // A (j,i) = x
+         I [ntuples] = j ;
+         J [ntuples] = i ;
+         X [ntuples] = x ;
+         ntuples++ ;
+     }
+ }
+ GrB_Matrix_build (A, I, J, X, ntuples, GrB_SECOND_FP64) ; \end{verbatim}}
- \begin{itemize}
- \item \verb'C(I,J) = scalar'
- \item \verb'C(I,J) = scalar'
- \item \verb'C(I,J) = scalar'
- \item \verb'C(I,J) = scalar'
- \item \verb'C(I,J) = scalar'
- \item \verb'C(I,J) = A'
- \item \verb'C(I,J) = A'
- \item \verb'C(I,J) = A'
- \item \verb'C(I,J) = A'
- \item \verb'C(I,J) = A'
- \end{itemize}
+The equivalent \verb'sprandsym' function in MATLAB takes 150 seconds, but
+\verb'sprandsym' uses a much higher-quality random number generator to create
+the tuples \verb'[I,J,X]'.  Considering just the time for
+\verb'sparse(I,J,X,n,n)' in \verb'sprandsym' (equivalent to
+\verb'GrB_Matrix_build'), the time is 70 seconds.  That is, these three
+methods, \verb'setElement' and \verb'build' in SuiteSparse:GraphBLAS, and
+\verb'sparse' in MATLAB, are roughly equally fast.
-\item
-For these forms of assignment, \verb'C' is always iso on output, regardless
-of its iso property on input:
 %-------------------------------------------------------------------------------
+\subsection{Creating a finite-element matrix}
 %-------------------------------------------------------------------------------
+\label{fem}
- \begin{itemize}
- \item \verb'C = scalar'
- \item \verb'C=scalar'; C empty on input.
- \item \verb'C=scalar'
- \end{itemize}
+Suppose a finite-element matrix is being constructed, with \verb'k=40,000'
+finite-element matrices, each of size \verb'8'-by-\verb'8'.  The following
+operations (in pseudo-MATLAB notation) are very efficient in
+SuiteSparse:GraphBLAS.
-\item
-For these forms of assignment, \verb'C' is always iso on output if \verb'A'
-is iso:
+ {\footnotesize
+ \begin{verbatim}
+ A = sparse (n,n) ;   % create an empty n-by-n sparse GraphBLAS matrix
+ for i = 1:k
+     construct an 8-by-8 sparse or dense finite-element F
+     I and J define where the matrix F is to be added:
+     I = a list of 8 row indices
+     J = a list of 8 column indices
+     % using GrB_assign, with the 'plus' accum operator:
+     A (I,J) = A (I,J) + F
+ end \end{verbatim}}
+
+If this were done in MATLAB or in GraphBLAS with blocking mode enabled, the
+computations would be extremely slow.  A far better approach is to construct a
+list of tuples \verb'[I,J,X]' and to use \verb'sparse(I,J,X,n,n)'.  This is
+identical to creating the same list of tuples in GraphBLAS and using
+\verb'GrB_Matrix_build', which is equally fast.
+
+In SuiteSparse:GraphBLAS, the performance of both methods is essentially
+identical, and roughly as fast as \verb'sparse' in MATLAB.  Inside
+SuiteSparse:GraphBLAS, \verb'GrB_assign' is doing the same thing.  When
+performing \verb'A(I,J)=A(I,J)+F', if it finds that it cannot quickly insert an
+update into the \verb'A' matrix, it creates a list of pending tuples to be
+assembled later on.  When the matrix is ready for use in a subsequent
+GraphBLAS operation (one that normally cannot use a matrix with pending
+computations), the tuples are assembled all at once via
+\verb'GrB_Matrix_build'.
- \begin{itemize}
- \item \verb'C = A'
- \item \verb'C = A'; C empty on input.
- \end{itemize}
-\end{itemize}
+GraphBLAS operations on other matrices have no effect on when the pending
+updates of a matrix are completed.  Thus, any GraphBLAS method or operation can
+be used to construct the \verb'F' matrix in the example above, without
+affecting when the pending updates to \verb'A' are completed.
+The MATLAB \verb'wathen.m' script is part of Higham's \verb'gallery' of
+matrices \cite{Higham}.  It creates a finite-element matrix with random
+coefficients for a 2D mesh of size \verb'nx'-by-\verb'ny', a matrix formulation
+by Wathen \cite{Wathen}.  The pattern of the matrix is fixed; just the values
+are randomized.  The GraphBLAS equivalent can use either
+\verb'GrB_Matrix_build', or \verb'GrB_assign'.  Both methods have good
+performance.  The \verb'GrB_Matrix_build' version below is about 15\% to 20\%
+faster than the MATLAB \verb'wathen.m' function, regardless of the problem
+size.  It uses the same algorithm as \verb'wathen.m'.
-\subsubsection{Assignment with an accumulator operator}
+ {\footnotesize
+ \begin{verbatim}
+ int64_t ntriplets = nx*ny*64 ;
+ I = malloc (ntriplets * sizeof (int64_t)) ;
+ J = malloc (ntriplets * sizeof (int64_t)) ;
+ X = malloc (ntriplets * sizeof (double )) ;
+ if (I == NULL || J == NULL || X == NULL)
+ {
+     FREE_ALL ;
+     return (GrB_OUT_OF_MEMORY) ;
+ }
+ ntriplets = 0 ;
+ for (int j = 1 ; j <= ny ; j++)
+ {
+     for (int i = 1 ; i <= nx ; i++)
+     {
+         nn [0] = 3*j*nx + 2*i + 2*j + 1 ;
+         nn [1] = nn [0] - 1 ;
+         nn [2] = nn [1] - 1 ;
+         nn [3] = (3*j-1)*nx + 2*j + i - 1 ;
+         nn [4] = 3*(j-1)*nx + 2*i + 2*j - 3 ;
+         nn [5] = nn [4] + 1 ;
+         nn [6] = nn [5] + 1 ;
+         nn [7] = nn [3] + 1 ;
+         for (int krow = 0 ; krow < 8 ; krow++) nn [krow]-- ;
+         for (int krow = 0 ; krow < 8 ; krow++)
+         {
+             for (int kcol = 0 ; kcol < 8 ; kcol++)
+             {
+                 I [ntriplets] = nn [krow] ;
+                 J [ntriplets] = nn [kcol] ;
+                 X [ntriplets] = em (krow,kcol) ;
+                 ntriplets++ ;
+             }
+         }
+     }
+ }
+ // A = sparse (I,J,X,n,n) ;
+ GrB_Matrix_build (A, I, J, X, ntriplets, GrB_PLUS_FP64) ; \end{verbatim}}
-If an accumulator operator is present, the following rules are used.
-Positional operators (\verb'GxB_FIRSTI' and related) cannot be used as
-accumulator operators, so these rules do not consider that case.
+The \verb'GrB_assign' version has the advantage of not requiring the
+user application to construct the tuple list, and is almost as fast as using
+\verb'GrB_Matrix_build'.  The code is more elegant than either the MATLAB
+\verb'wathen.m' function or its GraphBLAS equivalent above.  Its performance
+is comparable with the other two methods, though slightly slower: about 5\%
+slower than the MATLAB \verb'wathen', and 20\% slower than the GraphBLAS
+method above.
-\begin{itemize}
-\item
-For matrix assignment, \verb'A' must be iso.  For scalar assignment, the single
-scalar is implicitly expanded into an iso matrix \verb'A' of the right size.
-If these rules do not hold, \verb'C' is not iso.
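+As a hedged aside, if the user application wants the pending tuples to be
+assembled at a known point (before starting a timer, say), it can force
+completion itself.  The sketch below assumes the v2.0 C API form of
+\verb'GrB_Matrix_wait':
+
+ {\footnotesize
+ \begin{verbatim}
+ // force any pending updates to A to be assembled now:
+ GrB_Matrix_wait (A, GrB_MATERIALIZE) ;
+ // querying the matrix (its nvals, say) also forces completion:
+ GrB_Index nvals ;
+ GrB_Matrix_nvals (&nvals, A) ; \end{verbatim}}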
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Matrix_new (&F, GrB_FP64, 8, 8) ;
+ for (int j = 1 ; j <= ny ; j++)
+ {
+     for (int i = 1 ; i <= nx ; i++)
+     {
+         nn [0] = 3*j*nx + 2*i + 2*j + 1 ;
+         nn [1] = nn [0] - 1 ;
+         nn [2] = nn [1] - 1 ;
+         nn [3] = (3*j-1)*nx + 2*j + i - 1 ;
+         nn [4] = 3*(j-1)*nx + 2*i + 2*j - 3 ;
+         nn [5] = nn [4] + 1 ;
+         nn [6] = nn [5] + 1 ;
+         nn [7] = nn [3] + 1 ;
+         for (int krow = 0 ; krow < 8 ; krow++) nn [krow]-- ;
+         for (int krow = 0 ; krow < 8 ; krow++)
+         {
+             for (int kcol = 0 ; kcol < 8 ; kcol++)
+             {
+                 // F (krow,kcol) = em (krow, kcol)
+                 GrB_Matrix_setElement (F, em (krow,kcol), krow, kcol) ;
+             }
+         }
+         // A (nn,nn) += F
+         GrB_assign (A, NULL, GrB_PLUS_FP64, F, nn, 8, nn, 8, NULL) ;
+     }
+ } \end{verbatim}}
-\item For these forms of assignment \verb'C' is iso if \verb'C' is
-empty on input, or if $c=c+a$ for the where $a$ is the iso value of \verb'A' or
-the value of the scalar for scalar assignment.
+Since there is no \verb'Mask', and since \verb'GrB_REPLACE' is not used, the call
+to \verb'GrB_assign' in the example above is identical to \verb'GxB_subassign'.
+Either one can be used, and their performance would be identical.
- \begin{itemize}
- \item \verb'C(I,J) += scalar'
- \item \verb'C(I,J) += scalar'
- \item \verb'C(I,J) += scalar'
- \item \verb'C(I,J) += scalar'
- \item \verb'C(I,J) += scalar'
- \item \verb'C(I,J) += A'
- \item \verb'C(I,J) += A'
- \item \verb'C(I,J) += A'
- \item \verb'C(I,J) += A'
- \item \verb'C(I,J) += A '
- \item \verb'C += A'
- \end{itemize}
-\end{itemize}
+Refer to the \verb'wathen.c' function in the \verb'Demo' folder, which
+uses GraphBLAS to implement the two methods above, and two additional ones.
 %-------------------------------------------------------------------------------
-\subsection{Iso matrices from build methods}
+\subsection{Reading a matrix from a file}
 %-------------------------------------------------------------------------------
-\label{iso_build}
-
-\verb'GxB_Matrix_build_Scalar' and \verb'GxB_Vector_build_Scalar'
-always construct an iso matrix/vector.
+\label{read}
-\verb'GrB_Matrix_build' and \verb'GrB_Vector_build' can also construct iso
-matrices and vectors.  A non-iso matrix/vector is constructed first, and then
-the entries are checked to see if they are all equal.  The resulting iso-valued
-matrix/vector will be efficient to use and will use less memory than a non-iso
-matrix/vector.  However, constructing an iso matrix/vector with
-\verb'GrB_Matrix_build' and \verb'GrB_Vector_build' will take more time
-and memory than constructing the matrix/vector with
-\verb'GxB_Matrix_build_Scalar' or \verb'GxB_Vector_build_Scalar'.
+See also \verb'LAGraph_mmread' and \verb'LAGraph_mmwrite', which
+can read and write any matrix in Matrix Market format, and
+\verb'LAGraph_binread' and \verb'LAGraph_binwrite', which read and write a
+matrix in a binary format.  The binary file I/O functions are much faster than
+the \verb'read_matrix' function described here, and also much faster than
+\verb'LAGraph_mmread' and \verb'LAGraph_mmwrite'.
-%-------------------------------------------------------------------------------
-\subsection{Iso matrices from other methods}
-%-------------------------------------------------------------------------------
-\label{iso_other}
+The \verb'read_matrix' function in the \verb'Demo' reads in a triplet matrix
+from a file, one line per entry, and then uses \verb'GrB_Matrix_build' to
+create the matrix.  It creates a second copy with \verb'GrB_Matrix_setElement',
+just to test that method and compare the run times.
+Section~\ref{random} has already compared
+\verb'build' versus \verb'setElement'.
-\begin{itemize}
-\item
-For \verb'GrB_Matrix_dup' and \verb'GrB_Vector_dup', the output matrix/vector
-has the same iso property as the input matrix/vector.
+The function can return the matrix as-is, which may be rectangular or
+unsymmetric.  If an input parameter is set to make the matrix symmetric,
+\verb'read_matrix' computes \verb"A=(A+A')/2" if \verb'A' is square (turning
+all directed edges into undirected ones).  If \verb'A' is rectangular, it
+creates a bipartite graph, which is the same as the augmented matrix,
+\verb"A = [0 A ; A' 0]".
+If \verb'C' is an \verb'n'-by-\verb'n' matrix, then \verb"C=(C+C')/2" can be
+computed as follows in GraphBLAS (the \verb'scale2' function divides an entry
+by 2):
-\item
-\verb'GrB_*_setElement_*' preserves the iso property of the matrix/vector it
-modifies, if the input scalar is equal to the iso value of the matrix/vector.
-If the matrix or vector has no entries, the first call to \verb'setElement'
-makes it iso.  This allows a sequence of \verb'setElement' calls with the same
-scalar value to create an entire iso matrix or vector, if starting from
-an empty matrix or vector.
+ \vspace{-0.05in}
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Descriptor_new (&dt2) ;
+ GrB_Descriptor_set (dt2, GrB_INP1, GrB_TRAN) ;
+ GrB_Matrix_new (&A, GrB_FP64, n, n) ;
+ GrB_eWiseAdd (A, NULL, NULL, GrB_PLUS_FP64, C, C, dt2) ;   // A=C+C'
+ GrB_free (&C) ;
+ GrB_Matrix_new (&C, GrB_FP64, n, n) ;
+ GrB_UnaryOp_new (&scale2_op, scale2, GrB_FP64, GrB_FP64) ;
+ GrB_apply (C, NULL, NULL, scale2_op, A, NULL) ;            // C=A/2
+ GrB_free (&A) ;
+ GrB_free (&scale2_op) ; \end{verbatim}}
-\item
-\verb'GxB_Matrix_concat' constructs an iso matrix as its result if all input
-tiles are either empty or iso.
+This is of course not nearly as elegant as \verb"A=(A+A')/2" in MATLAB, but
+with minor changes it can work on any type and use any built-in operators
+instead of \verb'PLUS', or it can use any user-defined operators and types.
+The above code in SuiteSparse:GraphBLAS takes 0.60 seconds for the
+\verb'Freescale2' matrix, slightly slower than MATLAB (0.55 seconds).
-\item
-\verb'GxB_Matrix_split' constructs its output tiles as iso if its input
-matrix is iso.
+Constructing the augmented system is more complicated using the GraphBLAS C API
+Specification since it does not yet have a simple way of specifying a range of
+row and column indices, as in \verb'A(10:20,30:50)' in MATLAB (\verb'GxB_RANGE'
+is a SuiteSparse:GraphBLAS extension that is not in the Specification).  Using
+the C API in the Specification, the application must instead build a list of
+indices first, \verb'I=[10, 11' \verb'...' \verb'20]'.
-\item
-\verb'GxB_Matrix_diag' and \verb'GrB_Matrix_diag' construct an iso matrix if
-its input vector is iso.
+Thus, to compute the MATLAB equivalent of \verb"A = [0 A ; A' 0]", index lists
+\verb'I' and \verb'J' must first be constructed:
-\item
-\verb'GxB_Vector_diag' constructs an iso vector if its input matrix is iso.
+ \vspace{-0.05in}
+ {\footnotesize
+ \begin{verbatim}
+ int64_t n = nrows + ncols ;
+ I = malloc (nrows * sizeof (int64_t)) ;
+ J = malloc (ncols * sizeof (int64_t)) ;
+ // I = 0:nrows-1
+ // J = nrows:n-1
+ if (I == NULL || J == NULL)
+ {
+     if (I != NULL) free (I) ;
+     if (J != NULL) free (J) ;
+     return (GrB_OUT_OF_MEMORY) ;
+ }
+ for (int64_t k = 0 ; k < nrows ; k++) I [k] = k ;
+ for (int64_t k = 0 ; k < ncols ; k++) J [k] = k + nrows ; \end{verbatim}}
-\item
-\verb'GrB_*extract' constructs an iso matrix/vector if its input matrix/vector
-is iso.
+Once the index lists are generated, however, the resulting GraphBLAS operations
+are fairly straightforward, computing \verb"A=[0 C ; C' 0]".
-\item
-\verb'GrB_transpose' constructs an iso matrix if its input is iso.
+ \vspace{-0.05in}
+ {\footnotesize
+ \begin{verbatim}
+ GrB_Descriptor_new (&dt1) ;
+ GrB_Descriptor_set (dt1, GrB_INP0, GrB_TRAN) ;
+ GrB_Matrix_new (&A, GrB_FP64, n, n) ;
+ // A (nrows:n-1, 0:nrows-1) = C'
+ GrB_assign (A, NULL, NULL, C, J, ncols, I, nrows, dt1) ;
+ // A (0:nrows-1, nrows:n-1) = C
+ GrB_assign (A, NULL, NULL, C, I, nrows, J, ncols, NULL) ; \end{verbatim}}
-\item
-The \verb'GxB_import/export/pack/unpack' methods preserve the iso property
-of their matrices/vectors.
-\end{itemize}
+This takes 1.38 seconds for the \verb'Freescale2' matrix, almost as fast as \newline
+\verb"A=[sparse(m,m) C ; C' sparse(n,n)]" in MATLAB (1.25 seconds).
+The \verb'GxB_Matrix_concat' function would be faster still (this example
+was written before \verb'GxB_Matrix_concat' was added to SuiteSparse:GraphBLAS).
 %-------------------------------------------------------------------------------
-\subsection{Iso matrices not exploited}
 %-------------------------------------------------------------------------------
+Both calls to \verb'GrB_assign' use no accumulator, so the second one
+causes the partial matrix \verb"A=[0 0 ; C' 0]" to be built first, followed by
+the final build of \verb"A=[0 C ; C' 0]".  A better method, but not an obvious
+one, is to use the \verb'GrB_FIRST_FP64' accumulator for both assignments.  An
+accumulator enables SuiteSparse:GraphBLAS to determine that the entries
+created by the first assignment cannot be deleted by the second, and thus it
+need not force completion of the pending updates prior to the second
+assignment.
-There are many cases where an matrix may have the iso property but it is not
-detected by SuiteSparse:GraphBLAS.  For example, if \verb'A' is non-iso,
-\verb'C=A(I,J)' from \verb'GrB_extract' may be iso, if all entries in the
-extracted submatrix have the same value.  Future versions of
-SuiteSparse:GraphBLAS may extend the rules described in this section to detect
-these cases.
+SuiteSparse:GraphBLAS also adds a \verb'GxB_RANGE' mechanism that mimics
+the MATLAB colon notation.  This speeds up the method and simplifies the
+code the user needs to write to compute \verb"A=[0 C ; C' 0]":
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Performance} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{perf}
+ \vspace{-0.05in}
+ {\footnotesize
+ \begin{verbatim}
+ int64_t n = nrows + ncols ;
+ GrB_Matrix_new (&A, xtype, n, n) ;
+ GrB_Index I_range [3], J_range [3] ;
+ I_range [GxB_BEGIN] = 0 ;
+ I_range [GxB_END  ] = nrows-1 ;
+ J_range [GxB_BEGIN] = nrows ;
+ J_range [GxB_END  ] = ncols+nrows-1 ;
+ // A (nrows:n-1, 0:nrows-1) += C'
+ GrB_assign (A, NULL, GrB_FIRST_FP64, // or NULL,
+     C, J_range, GxB_RANGE, I_range, GxB_RANGE, dt1) ;
+ // A (0:nrows-1, nrows:n-1) += C
+ GrB_assign (A, NULL, GrB_FIRST_FP64, // or NULL,
+     C, I_range, GxB_RANGE, J_range, GxB_RANGE, NULL) ; \end{verbatim}}
-Getting the best performance out of an algorithm that relies on GraphBLAS can
-depend on many factors.  This section describes some of the possible
-performance pitfalls you can hit when using SuiteSparse:GraphBLAS, and how to
-avoid them (or at least know when you've encountered them).
+Any operator will suffice because it is not actually applied.  An operator is
+only applied to the set intersection, and the two assignments do not overlap.
+If an \verb'accum' operator is used, only the final matrix is built, and the
+time in GraphBLAS drops slightly to 1.25 seconds.  This is a very small
+improvement because in this particular case, SuiteSparse:GraphBLAS is able to
+detect that no sorting is required for the first build, and the second one is a
+simple concatenation.  In general, however, allowing GraphBLAS to postpone
+pending updates can lead to significant reductions in run time.
 %-------------------------------------------------------------------------------
-\subsection{The burble is your friend}
+\subsection{User-defined types and operators}
 %-------------------------------------------------------------------------------
+\label{user}
-Turn on the burble with \verb'GxB_set (GxB_BURBLE, true)'.  You will get a
-single line of output from each (significant) call to GraphBLAS.
-The burble output can help you detect when you are likely using sub-optimal
-methods, as described in the next sections.
+The \verb'Demo' folder contains two working examples of user-defined types,
+first discussed in Section~\ref{type_new}: \verb'double complex', and a
+user-defined \verb'typedef' called \verb'wildtype' with a \verb'struct'
+containing a string and a 4-by-4 \verb'float' matrix.
 %-------------------------------------------------------------------------------
-\subsection{Data types and typecasting}
 %-------------------------------------------------------------------------------
+{\bf Double Complex:}
+Prior to v3.3, GraphBLAS did not have a native complex type.  It now appears as
+the \verb'GxB_FC64' predefined type, but a complex type can also easily be
+added as a user-defined type.  The \verb'Complex_init' function in the
+\verb'usercomplex.c' file in the \verb'Demo' folder creates the \verb'Complex'
+type based on the ANSI C11 \verb'double complex' type.
+It creates a full suite of operators that correspond to every
+built-in GraphBLAS operator, both binary and unary.  In addition, it
+creates the operators listed in the following table, where $D$ is
+\verb'double' and $C$ is \verb'Complex'.
-Avoid mixing data types and relying on typecasting as much as possible.
-SuiteSparse:GraphBLAS has a set of highly-tuned kernels for each data type,
-and many operators and semirings, but there are too many combinations to
-generate ahead of time.  If typecasting is required, or if
-SuiteSparse:GraphBLAS does not have a kernel for the specific operator or
-semiring, the word \verb'generic' will appear in the burble.  The generic
-methods rely on function pointers for each operation on every scalar, so they
-are slow.  A future JIT will avoid this problem.
+\vspace{0.1in}
+{\footnotesize
+\begin{tabular}{llll}
+\hline
+name & types & Octave/MATLAB & description \\
+ & & equivalent & \\
+\hline
+\verb'Complex_complex' & $D \times D \rightarrow C$ & \verb'z=complex(x,y)' & complex from real and imag. \\
+\hline
+\verb'Complex_conj' & $C \rightarrow C$ & \verb'z=conj(x)' & complex conjugate \\
+\verb'Complex_real' & $C \rightarrow D$ & \verb'z=real(x)' & real part \\
+\verb'Complex_imag' & $C \rightarrow D$ & \verb'z=imag(x)' & imaginary part \\
+\verb'Complex_angle' & $C \rightarrow D$ & \verb'z=angle(x)' & phase angle \\
+\verb'Complex_complex_real' & $D \rightarrow C$ & \verb'z=complex(x,0)' & real to complex real \\
+\verb'Complex_complex_imag' & $D \rightarrow C$ & \verb'z=complex(0,x)' & real to complex imag. \\
+\hline
+\end{tabular}
+}
-The only time that typecasting is fast is when computing \verb'C=A' via
-\verb'GrB_assign' or \verb'GrB_apply', where the data types of \verb'C' and
-\verb'A' can differ.  In this case, one of $13^2 = 169$ kernels are called,
-each of which performs the specific typecasting requested, without relying on
-function pointers.
+The \verb'Complex_init' function creates two monoids (\verb'Complex_add_monoid'
+and \verb'Complex_times_monoid') and a semiring \verb'Complex_plus_times' that
+corresponds to the conventional linear algebra for complex matrices.  The
+include file \verb'usercomplex.h' in the \verb'Demo' folder is available so
+that this user-defined \verb'Complex' type can easily be imported into any
+other user application.  When the user application is done, the
+\verb'Complex_finalize' function frees the \verb'Complex' type and its
+operators, monoids, and semiring.
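+Once created, the user-defined type can be used like any built-in type.  The
+following is a minimal sketch (assuming \verb'Complex_init' has been called,
+and \verb'A' and \verb'B' are \verb'n'-by-\verb'n' matrices of type
+\verb'Complex'):
+
+ {\footnotesize
+ \begin{verbatim}
+ // C = A*B over the user-defined complex plus-times semiring
+ GrB_Matrix C ;
+ GrB_Matrix_new (&C, Complex, n, n) ;
+ GrB_mxm (C, NULL, NULL, Complex_plus_times, A, B, NULL) ; \end{verbatim}}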
+NOTE: the \verb'Complex' type is not supported in this Demo in Microsoft
+Visual Studio.
+
+{\bf Struct-based:}
+In addition, the \verb'wildtype.c' program creates a user-defined
+\verb'typedef' of a \verb'struct' containing a dense 4-by-4 \verb'float'
+matrix, and a 64-character string.  It constructs an additive monoid that adds
+two 4-by-4 dense matrices, and a multiplier operator that multiplies two 4-by-4
+matrices.  Each of these 4-by-4 matrices is treated by GraphBLAS as a
+``scalar'' value, and they can be manipulated in the same way as any other
+GraphBLAS type.  The purpose of this type is to illustrate the endless
+possibilities of user-defined types and their use in GraphBLAS.
 %-------------------------------------------------------------------------------
-\subsection{Matrix data structures: sparse, hypersparse, bitmap, or full}
+\subsection{User applications using OpenMP or other threading models}
 %-------------------------------------------------------------------------------
+\label{threads}
-SuiteSparse:GraphBLAS tries to automatically determine the best data structure
-for your matrices and vectors, selecting between sparse, hypersparse, bitmap,
-and full formats.  By default, all 4 formats can be used.  A matrix typically
-starts out hypersparse when it is created by \verb'GrB_Matrix_new', and then
-changes during its lifetime, possibly taking on all four different formats
-at different times.  This can be modified via \verb'GxB_set'.  For example,
-this line of code:
- {\scriptsize
- \begin{verbatim}
- GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE + GxB_BITMAP) ; \end{verbatim}}
-\noindent
-tells SuiteSparse that the matrix \verb'A' can be held in either sparse or
-bitmap format (at its discretion), but not hypersparse or full.  The bitmap
-format will be used if the matrix has enough entries, or sparse otherwise.
-Sometimes this selection is best controlled by the user algorithm, so a single
-format can be requested:
+An example demo program (\verb'openmp_demo') is included that illustrates how a
+multi-threaded user application can use GraphBLAS.
- {\scriptsize
- \begin{verbatim}
- GxB_set (A, GxB_SPARSITY_CONTROL, GxB_SPARSE) ; \end{verbatim}}
+The results from the \verb'openmp_demo' program may appear out of order.  This
+is by design, simply to show that the user application is running in parallel.
+The output of each thread should be the same.  In particular, each thread
+generates an intentional error, and later on prints it with \verb'GrB_error'.
+It will print its own error, not an error from another thread.  When all the
+threads finish, the leader thread prints out each matrix generated by each
+thread.
-This ensures that SuiteSparse will primarily use the sparse format.  This is
-still just a hint, however.  The data structure is opaque and SuiteSparse is
-free to choose otherwise.  In particular, if you insist on using only the
-\verb'GxB_FULL' format, then that format is used when all entries are present.
-However, if the matrix is not actually full with all entries present, then the
-bitmap format is used instead.  The full format does not preserve the sparsity
-structure in this case.  Any GraphBLAS library must preserve the proper
-structure, per the C Specification.  This is critical in a graph algorithm,
-since an edge $(i,j)$ of weight zero, say, is not the same as no edge $(i,j)$
-at all.
+GraphBLAS can also be combined with user applications that rely on MPI, the
+Intel TBB threading library, POSIX pthreads, Microsoft Windows threads, or any
+other threading library.  In all cases, GraphBLAS will be thread safe.
+\newpage
 %-------------------------------------------------------------------------------
-\subsection{Matrix formats: by row or by column, or using the transpose of
-a matrix}
+\section{Compiling and Installing SuiteSparse:GraphBLAS}
 %-------------------------------------------------------------------------------
+\label{sec:install}
-By default, SuiteSparse uses a simple rule:
-all matrices are held by row, unless the consist of a single
-column, in which case they are held by column.  All vectors are treated as if
-they are $n$-by-1 matrices with a single column.  Changing formats from
-row-oriented to column-oriented can have significant performance implications,
-so SuiteSparse never tries to outguess the application.  It just uses this
-simple rule.
-However, there are cases where changing the format can greatly improve
-performance.  There are two ways to handle this, which in the end are
-equivalent in the SuiteSparse internals.  You can change the format (row to
-column oriented, or visa versa), or work with the explicit transpose of a
-matrix in the same storage orientation.
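+Once the library is compiled and installed (see the platform-specific notes
+below), a minimal test program can confirm that GraphBLAS compiles and links.
+This sketch is not part of the \verb'Demo' folder; compile and link it with
+\verb'-lgraphblas':
+
+ {\small
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+ #include <stdio.h>
+ int main (void)
+ {
+     GrB_init (GrB_NONBLOCKING) ;
+     GrB_Matrix A ;
+     GrB_Matrix_new (&A, GrB_FP64, 2, 2) ;
+     GrB_Matrix_setElement_FP64 (A, 3.14159, 0, 1) ;
+     GxB_print (A, GxB_SHORT) ;      // prints the matrix contents
+     GrB_Matrix_free (&A) ;
+     GrB_finalize ( ) ;
+     return (0) ;
+ } \end{verbatim} }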
+%----------------------------------------
+\subsection{On Linux and Mac}
+%----------------------------------------
-There are cases where SuiteSparse must explicitly transpose an input matrix, or
-the output matrix, in order to perform a computation.  For example, if all
-matrices are held in row-oriented fashion, SuiteSparse does not have a method
-for computing \verb"C=A'*B", where \verb'A' is transposed.  Thus, SuiteSparse
-either computes a temporary transpose of its input matrix \verb'AT=A' and then
-\verb'C=AT*B', or it swaps the computations, performing \verb"C=(B'*A)'", which
-requires an explicit transpose of \verb'BT=B', and a transpose of the final
-result to obtain \verb'C'.
+GraphBLAS makes extensive use of features in the ANSI C11 standard, and thus a
+C compiler supporting this version of the C standard is required to use
+all features of GraphBLAS.
-These temporary transposes are costly to compute, taking time and memory.  They
-are not kept, but are discarded when the method returns to the user
-application.  If you see the term \verb'transpose' in the burble output, and if
-you need to perform this computation many times, try constructing your own
-explicit transpose, say \verb"AT=A'", via \verb'GrB_transpose', or create a
-copy of \verb'A' but held in another orientation via \verb'GxB_set'.  For
-example, assuming the default matrix format is by-row, and that \verb'A' is
-\verb'm'-by-\verb'n' of type \verb'GrB_FP32':
+{\bf Any version of the Intel \verb'icx' compiler is highly recommended.}  In
+most cases, the Intel \verb'icx' and the Intel OpenMP library (\verb'libiomp')
+result in the best performance.  The \verb'gcc' compiler and the GNU OpenMP
+library (\verb'libgomp') generally give good performance: typically on par
+with \verb'icx' but in a few special cases significantly slower.  The Intel
+\verb'icc' compiler is not recommended; it results in poor performance for
+\verb'#pragma omp atomic'.
- {\scriptsize
- \begin{verbatim}
- // method 1: AT = A'
- GrB_Matrix_new (AT, GrB_FP32, n, m) ;
- GrB_transpose (AT, NULL, NULL, A, NULL) ;
+On the Mac (OS X), \verb'clang' 8.0.0 in \verb'Xcode' version 8.2.1 is
+sufficient, although earlier versions of \verb'Xcode' may work as well.  For
+the GNU \verb'gcc' compiler, version 4.9 or later is required, but best
+performance is obtained in 9.3 or later.  Version 3.13 or later of \verb'cmake'
+is required; version 3.17 is preferred.
- // method 2: A2 = A but held by column instead of by row
- // note: doing the set before the assign is faster than the reverse
- GrB_Matrix_new (A2, GrB_FP32, m, n) ;
- GxB_set (A2, GxB_FORMAT, GxB_BY_COL) ;
- GrB_assign (A2, NULL, NULL, A, GrB_ALL, m, GrB_ALL, n, NULL) ; \end{verbatim}}
+If you are using a pre-C11 ANSI C compiler, such as Microsoft Visual Studio,
+then the \verb'_Generic' keyword is not available.  SuiteSparse:GraphBLAS
+will still compile, but you will not have access to polymorphic functions
+such as \verb'GrB_assign'.  You will need to use the non-polymorphic functions
+instead.
-Internally, the data structure for \verb'AT' and \verb'A2' are nearly identical
-(that is, the tranpose of \verb'A' held in row format is the same as \verb'A'
-held in column format).  Using either of them in subsequent calls to GraphBLAS
-will allow SuiteSparse to avoid computing an explicit transpose.  The two
-matrices \verb'AT' and \verb'A2' do differ in one very significant way: their
-dimensions are different, and they behave differement mathematically.
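+For example, this is nothing more than the naming convention; both calls do
+the same thing:
+
+ {\small
+ \begin{verbatim}
+ // polymorphic name; requires _Generic from C11:
+ GrB_free (&A) ;
+ // non-polymorphic equivalent; works with any compiler:
+ GrB_Matrix_free (&A) ; \end{verbatim} }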
-Computing \verb"C=A'*B" using these matrices would differ:
+To compile SuiteSparse:GraphBLAS, simply type \verb'make' in the main GraphBLAS
+folder, which compiles the library with your default system compiler.  This
+compiles GraphBLAS using 8 threads, which can take a long time.  To compile
+with more threads (40, for this example), use:
- {\scriptsize
+ {\small
 \begin{verbatim}
- // method 1: C=A'*B using AT
- GrB_mxm (C, NULL, NULL, semiring, AT, B, NULL) ;
+ make JOBS=40 \end{verbatim} }
- // method 2: C=A'*B using A2
- GrB_mxm (C, NULL, NULL, semiring, A2, B, GrB_DESC_T0) ; \end{verbatim}}
+To use a non-default compiler with 4 threads:
-The first method computes \verb'C=AT*B'.  The second method computes
-\verb"C=A2'*B", but the result of both computations is the same, and internally
-the same kernels will be used.
+ {\small
+ \begin{verbatim}
+ make CC=icx CXX=icpx JOBS=4 \end{verbatim} }
 %-------------------------------------------------------------------------------
-\subsection{Push/pull optimization}
 %-------------------------------------------------------------------------------
+GraphBLAS v6.1.3 and later use the \verb'cpu_features' package by Google to
+determine if the target architecture supports AVX2 and/or AVX512F (on Intel
+x86\_64 architectures only).  In case you have build issues with this package,
+you can compile without it (and then AVX2 and AVX512F acceleration will not
+be used):
-Closely related to the discussion above on when to use a matrix or its
-transpose is the exploitation of ``push/pull'' direction optimization.  In
-linear algebraic terms, this is simply deciding whether to multiply by the
-matrix or its transpose.  Examples can be see in the BFS and
-Betweeness-Centrality methods of LAGraph.  Here is the BFS kernel:
+ {\small
+ \begin{verbatim}
+ make CMAKE_OPTIONS='-DGBNCPUFEAT=1' \end{verbatim} }
- {\scriptsize
- \begin{verbatim}
- int sparsity = do_push ? GxB_SPARSE : GxB_BITMAP ;
- GxB_set (q, GxB_SPARSITY_CONTROL, sparsity) ;
- if (do_push)
- {
-     // q'{!pi} = q'*A
-     GrB_vxm (q, pi, NULL, semiring, q, A, GrB_DESC_RSC) ;
- }
- else
- {
-     // q{!pi} = AT*q
-     GrB_mxv (q, pi, NULL, semiring, AT, q, GrB_DESC_RSC) ;
- }\end{verbatim}}
+Without \verb'cpu_features', it is still possible to enable AVX2 and AVX512F.
+Rather than relying on run-time tests, you can use these flags to enable
+both AVX2 and AVX512F directly:
+ {\small
+ \begin{verbatim}
+ make CMAKE_OPTIONS='-DGBNCPUFEAT=1 -DGBAVX2=1 -DGBAVX512F=1' \end{verbatim} }
-The call to \verb'GxB_set' is optional, since SuiteSparse will likely already
-determine that a bitmap format will work best when the frontier \verb'q' has
-many entries, which is also when the pull step is fastest.  The push step
-relies on a sparse vector times sparse matrix method originally due to
-Gustavson.  The output is computed as a set union of all rows \verb'A(i,:)'
-where \verb'q(i)' is present on input.  This set union is very fast when
-\verb'q' is very sparse.  The pull step relies on a sequence of dot product
-computations, one per possible entry in the output \verb'q', and it uses the
-matrix \verb"AT" which is a row-oriented copy of the explicit transpose of the
-adjacency matrix \verb'A'.
+To use multiple options, separate them by a space.  For example, to build
+just the library but not \verb'cpu_features', and to enable
+AVX2 but not AVX512F, and use 40 threads to compile:
+ {\small
+ \begin{verbatim}
+ make CMAKE_OPTIONS='-DGBNCPUFEAT=1 -DGBAVX2=1' JOBS=40 \end{verbatim} }
-%-------------------------------------------------------------------------------
-\subsection{Computing with full matrices and vectors}
-%-------------------------------------------------------------------------------
+After compiling the library, you can compile the demos with
+\verb'make all' and then \verb'make run' while in the top-level
+GraphBLAS folder.
-Sometimes the best approach to getting the highest performance is to use dense
-vectors, and occassionaly dense matrices are tall-and-thin or short-and-fat.
-Packages such as Julia, Octave, or MATLAB, when dealing with the conventional
-plus-times semirings, assume that multiplying a sparse matrix \verb'A' times a
-dense vector \verb'x', \verb'y=A*x', will result in a dense vector \verb'y'.
-This is not always the case, however.  GraphBLAS must always return a result
-that respects the sparsity structure of the output matrix or vector.  If the
-$i$th row of \verb'A' has no entries then \verb'y(i)' must not appear as an
-entry in the vector \verb'y', so it cannot be held as a full vector.  As a
-result, the following computation can be slower than it could be:
+If \verb'cmake' or \verb'make' fails, it might be that your default compiler
+does not support ANSI C11.  Try another compiler.  For example, try one of
+these options.  Go into the \verb'build' directory and type one of these:
- {\scriptsize
- \begin{verbatim}
- GrB_mxv (y, NULL, NULL, semiring, A, x, NULL) ; \end{verbatim}}
+ {\small
+ \begin{verbatim}
+ CC=gcc cmake ..
+ CC=gcc-11 cmake ..
+ CC=xlc cmake ..
+ CC=icx cmake .. \end{verbatim} }
-SuiteSparse must do extra work to compute the sparsity of this vector \verb'y',
-but if this is not needed, and \verb'y' can be padded with zeros (or
-the identity value of the monoid, to be precise), a faster method can be used,
-by relying on the accumulator.  Instead of computing \verb'y=A*x', set all
-entries of \verb'y' to zero first, and then compute \verb'y+=A*x' where the
-accumulator operator and type matches the monoid of the semiring.  SuiteSparse
-has special kernels for this case; you can see them in the burble as
-\verb'F+=S*F' for example.
+You can also do the following in the top-level GraphBLAS folder instead:
- {\scriptsize
- \begin{verbatim}
- // y = 0
- GrB_assign (y, NULL, NULL, 0, GrB_ALL, n, NULL) ;
- // y += A*x
- GrB_mxv (y, NULL, GrB_PLUS_FP32, GrB_PLUS_TIMES_SEMIRING_FP32, A, x, NULL) ; \end{verbatim}}
+ {\small
+ \begin{verbatim}
+ CC=gcc make
+ CC=gcc-11 make
+ CC=xlc make
+ CC=icx make \end{verbatim} }
-You can see this computation in the LAGraph PageRank method, where all
-entries of \verb'r' are set to the \verb'teleport' scalar first.
+For faster compilation, you can specify a parallel make.  For example,
+to use 32 parallel jobs and the \verb'gcc' compiler, do the following:
- {\scriptsize
+ {\small
 \begin{verbatim}
- for (iters = 0 ; iters < itermax && rdiff > tol ; iters++)
- {
-     // swap t and r ; now t is the old score
-     GrB_Vector temp = t ; t = r ; r = temp ;
-     // w = t ./ d
-     GrB_eWiseMult (w, NULL, NULL, GrB_DIV_FP32, t, d, NULL) ;
-     // r = teleport
-     GrB_assign (r, NULL, NULL, teleport, GrB_ALL, n, NULL) ;
-     // r += A'*w
-     GrB_mxv (r, NULL, GrB_PLUS_FP32, LAGraph_plus_second_fp32, AT, w, NULL) ;
-     // t -= r
-     GrB_assign (t, NULL, GrB_MINUS_FP32, r, GrB_ALL, n, NULL) ;
-     // t = abs (t)
-     GrB_apply (t, NULL, NULL, GrB_ABS_FP32, t, NULL) ;
-     // rdiff = sum (t)
-     GrB_reduce (&rdiff, NULL, GrB_PLUS_MONOID_FP32, t, NULL) ;
- } \end{verbatim}}
+ JOBS=32 CC=gcc make \end{verbatim} }
-SuiteSparse exploits the iso-valued property of the scalar-to-vector assignment
-of \verb'y=0', or \verb'r=teleport', and performs these assignments in O(1)
-time and space.  Because the \verb'r' vector start out as full on input to
-\verb'GrB_mxv', and because there is an accumulatr with no mask, no entries in
-the input/output vector \verb'r' will be deleted, even if \verb'A' has empty
-rows.  The call to \verb'GrB_mxv' exploits this, and is able to use a fast
-kernel for this computation.  SuiteSparse does not need to compute the sparsity
-pattern of the vector \verb'r'.
+If you do not have \verb'cmake', refer to Section~\ref{altmake}.
 %-------------------------------------------------------------------------------
-\subsection{Iso-valued matrices and vectors}
 %-------------------------------------------------------------------------------
+%----------------------------------------
+\subsection{More details on the Mac}
+%----------------------------------------
-Using iso-valued matrices and vectors is always faster than using matrices and
-vectors whose entries can have different values.  Iso-valued matrices are very
-important in graph algorithms.  For example, an unweighted graph is best
-represented as an iso-valued sparse matrix, and unweighted graphs are very
-common.  The burble output, or the \verb'GxB_print', \verb'GxB_Matrix_iso', or
-\verb'GxB_Vector_iso' can all be used to report whether or not your matrix or
-vector is iso-valued.
+SuiteSparse:GraphBLAS requires OpenMP for its internal parallelism, but
+OpenMP is not available on the Mac by default.
-Sometimes a matrix or vector may have values that are all the same, but
-SuiteSparse hasn't detected this.  If this occurs, you can force a matrix
-or vector to be iso-valued by assigning a single scalar to all its entries.
+If you have the Intel compiler and OpenMP library, then use the following
+in the top-level \verb'GraphBLAS' folder.  OpenMP will be found automatically:
- {\scriptsize
- \begin{verbatim}
- // C = 3.14159
- GrB_assign (C, C, NULL, 3.14159, GrB_ALL, m, GrB_ALL, n, GrB_DESC_S) ; \end{verbatim}}
+ {\small
+ \begin{verbatim}
+ make CC=icc CXX=icpc \end{verbatim} }
-The matrix \verb'C' is used as its own mask.  The descriptor is essential here,
-telling the mask to be used in a structural sense, without regard to the values
-of the entries in the mask.  This assignment sets all entries that already
-exist in \verb'C' to be equal to a single value, 3.14159.  The sparsity
-structure of \verb'C' does not change.  Of course, any scalar can be used; the
-value 1 is common for unweighted graphs.  SuiteSparse:GraphBLAS performs the
-above assignment in O(1) time and space, independent of the dimension of
-\verb'C' or the number of entries in contains.
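+To install the compiled library and include files system-wide (the
+\verb'sudo make install' step referenced elsewhere in this guide), type:
+
+ {\small
+ \begin{verbatim}
+ sudo make install \end{verbatim} }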
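+As a quick sanity check that OpenMP parallelism is enabled, the following
+sketch (using the \verb'GxB_get' extension, after \verb'GrB_init') reports the
+number of threads GraphBLAS will use:
+
+ {\small
+ \begin{verbatim}
+ int nthreads ;
+ GxB_get (GxB_NTHREADS, &nthreads) ;
+ printf ("GraphBLAS will use %d threads\n", nthreads) ; \end{verbatim} }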
+The following instructions work on MacOS Big Sur (v11.3)
+and MacOS Monterey (12.1), using
+cmake 3.13 or later:
-%-------------------------------------------------------------------------------
-\subsection{User-defined types and operators}
-%-------------------------------------------------------------------------------
+First install Xcode (see \url{https://developer.apple.com/xcode}),
+and then install the command line tools for Xcode:
-These are currently slow.  Once SuiteSparse:GraphBLAS employs a JIT
-accelerator, these data types and operators will be just as fast as built-in
-types and operators.  This work is in progress for the GPU, in CUDA, in
-collaboration with Joe Eaton and Corey Nolet.
+ {\small
+ \begin{verbatim}
+ cd /Applications/Utilities
+ xcode-select --install \end{verbatim} }
-\newpage
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Examples} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\label{examples}
+Next, install brew, at \url{https://brew.sh}.
-Several examples of how to use GraphBLAS are listed below.  They all
-appear in the \verb'Demo' folder of SuiteSparse:GraphBLAS.  Programs in
-the \verb'Demo' folder are meant as simple examples; for the fastest methods,
-see LAgraph (Section~\ref{lagraph}).
+If you are not using the MATLAB mexFunction interface, a recent version of the
+Apple Clang compiler works with \verb'libomp' and the
+\verb'GraphBLAS/CMakeLists.txt'.  To use the MATLAB mexFunction, however, you
+must use \verb'gcc' (\verb'gcc-11' is recommended).  Using Clang will result in
+a segfault when you attempt to use the \verb'@GrB' interface in MATLAB.
-\begin{enumerate}
-\item creating a random matrix
-\item creating a finite-element matrix
-\item reading a matrix from a file
-\item complex numbers as a user-defined type
-\item matrix import/export
-\end{enumerate}
+On MacOS Big Sur, install \verb'gcc-11', \verb'cmake', and OpenMP, and then
+compile GraphBLAS (cmake 3.13 or later is required).  For the MATLAB
+mexFunctions, you must use \verb'gcc-11'; the \verb'libomp' from \verb'brew'
+will allow you to compile the mexFunctions but they will not work properly.
-Additional examples appear in the newly created LAGraph project, currently in
-progress.
+ {\small
+ \begin{verbatim}
+ brew install cmake
+ brew install libomp
+ brew install gcc
+ cd GraphBLAS/GraphBLAS
+ make CC=gcc-11 CXX=g++-11 JOBS=8 \end{verbatim} }
-%-------------------------------------------------------------------------------
-\subsection{LAGraph}
-%-------------------------------------------------------------------------------
-\label{lagraph}
+The above instructions assume MATLAB R2021a, using
+\verb'libgraphblas_renamed.dylib', since that version of MATLAB includes its
+own copy of SuiteSparse:GraphBLAS (\verb'libmwgraphblas.dylib') but at version
+v3.3.3, not the latest version.
-The LAGraph project is a community-wide effort to create graph algorithms based
-on GraphBLAS (any implementation of the API, not just SuiteSparse: GraphBLAS).
-Some of the algorithms and utilities in LAGraph are listed in the table below.
-Many additional algorithms are planned.  Refer to
-\url{https://github.com/GraphBLAS/LAGraph} for a current list of algorithms.  All
-functions in the \verb'Demo/' folder in SuiteSparse:GraphBLAS will eventually
-be translated into algorithms or utilities for LAGraph, and then removed
-from \verb'GraphBLAS/Demo'.
+Next, compile the MATLAB mexFunctions.  I had to edit this file first:
+
+{\small
+\begin{verbatim}
+/Users/davis/Library/Application Support/MathWorks/MATLAB/R2021a/mex_C_maci64.xml \end{verbatim} }
+
+where you would replace \verb'davis' with your MacOS user name.
+Change lines 4 and 18, where both cases of \verb'MACOSX_DEPLOYMENT_TARGET=10.14'
+must become \verb"MACOSX_DEPLOYMENT_TARGET=11.3".  Otherwise, MATLAB
+complains that the \verb'libgraphblas_renamed.dylib' was built for 11.3 but
+linked for 10.14.
-To use LAGraph with SuiteSparse:GraphBLAS, place the two folders \verb'LAGraph'
-and \verb'GraphBLAS' in the same parent directory.  This allows the
-\verb'cmake' script in LAGraph to find the copy of GraphBLAS.  Alternatively,
-the GraphBLAS source could be placed anywhere, as long as
-\verb'sudo make install' is performed.
+Next, type the following in the MATLAB Command Window:
-%-------------------------------------------------------------------------------
-\subsection{Creating a random matrix}
-%-------------------------------------------------------------------------------
-\label{random}
+ {\small
+ \begin{verbatim}
+ cd GraphBLAS/GraphBLAS/@GrB/private
+ gbmake \end{verbatim} }
-The \verb'random_matrix' function in the \verb'Demo' folder generates a random
-matrix with a specified dimension and number of entries, either symmetric or
-unsymmetric, and with or without self-edges (diagonal entries in the matrix).
-It relies on \verb'simple_rand*' functions in the \verb'Demo' folder to provide
-a portable random number generator that creates the same sequence on any
-computer and operating system.
+Then add the paths to your \verb'startup.m' file (usually in
+\verb'~/Documents/MATLAB/startup.m').  For example, my path is:
-\verb'random_matrix' can use one of two methods: \verb'GrB_Matrix_setElement'
-and \verb'GrB_Matrix_build'.  The former method is very simple to use:
+ {\small
+ \begin{verbatim}
+ addpath ('/Users/davis/GraphBLAS/GraphBLAS') ;
+ addpath ('/Users/davis/GraphBLAS/GraphBLAS/build') ; \end{verbatim} }
- {\footnotesize
- \begin{verbatim}
- GrB_Matrix_new (&A, GrB_FP64, nrows, ncols) ;
- for (int64_t k = 0 ; k < ntuples ; k++)
- {
-     GrB_Index i = simple_rand_i ( ) % nrows ;
-     GrB_Index j = simple_rand_i ( ) % ncols ;
-     if (no_self_edges && (i == j)) continue ;
-     double x = simple_rand_x ( ) ;
-     // A (i,j) = x
-     GrB_Matrix_setElement (A, x, i, j) ;
-     if (make_symmetric)
-     {
-         // A (j,i) = x
-         GrB_Matrix_setElement (A, x, j, i) ;
-     }
- } \end{verbatim}}
+Finally, you can run the tests to see if your installation works:
+ {\small
+ \begin{verbatim}
+ cd ../../test
+ gbtest \end{verbatim} }
-The above code can generate a million-by-million sparse \verb'double' matrix
-with 200 million entries in 66 seconds (6 seconds of which is the time to
-generate the random \verb'i', \verb'j', and \verb'x'), including the time
-to finish all pending computations.  The user application does not need to
-create a list of all the tuples, nor does it need to know how many entries will
-appear in the matrix.  It just starts from an empty matrix and adds them one at
-a time in arbitrary order.  GraphBLAS handles the rest.  This method is not
-feasible in MATLAB.
+%----------------------------------------
+\subsection{On the ARM64 architecture}
+%----------------------------------------
-The next method uses \verb'GrB_Matrix_build'.  It is more complex to use than
-\verb'setElement' since it requires the user application to allocate and fill
-the tuple lists, and it requires knowledge of how many entries will appear in
-the matrix, or at least a good upper bound, before the matrix is constructed.
-It is slightly faster, creating the same matrix in 60 seconds, 51 seconds
-of which is spent in \verb'GrB_Matrix_build'.
+You may encounter a compiler error on the ARM64 architecture when using the
+\verb'gcc' compiler, versions 6.x and earlier.  This error was encountered on
+ARM64 Linux with gcc 6.x:
- {\footnotesize
- \begin{verbatim}
- GrB_Index *I, *J ;
- double *X ;
- int64_t s = ((make_symmetric) ? 2 : 1) * nedges + 1 ;
- I = malloc (s * sizeof (GrB_Index)) ;
- J = malloc (s * sizeof (GrB_Index)) ;
- X = malloc (s * sizeof (double )) ;
- if (I == NULL || J == NULL || X == NULL)
- {
-     // out of memory
-     if (I != NULL) free (I) ;
-     if (J != NULL) free (J) ;
-     if (X != NULL) free (X) ;
-     return (GrB_OUT_OF_MEMORY) ;
- }
- int64_t ntuples = 0 ;
- for (int64_t k = 0 ; k < nedges ; k++)
- {
-     GrB_Index i = simple_rand_i ( ) % nrows ;
-     GrB_Index j = simple_rand_i ( ) % ncols ;
-     if (no_self_edges && (i == j)) continue ;
-     double x = simple_rand_x ( ) ;
-     // A (i,j) = x
-     I [ntuples] = i ;
-     J [ntuples] = j ;
-     X [ntuples] = x ;
-     ntuples++ ;
-     if (make_symmetric)
-     {
-         // A (j,i) = x
-         I [ntuples] = j ;
-         J [ntuples] = i ;
-         X [ntuples] = x ;
-         ntuples++ ;
-     }
- }
- GrB_Matrix_build (A, I, J, X, ntuples, GrB_SECOND_FP64) ; \end{verbatim}}
+\begin{verbatim}
+`In function GrB_Matrix_apply_BinaryOp1st_Scalar.part.1':
+GrB_Matrix_apply.c:(.text+0x210): relocation truncated to
+fit: R_AARCH64_CALL26 against `.text.unlikely'
+\end{verbatim}
-The equivalent \verb'sprandsym' function in MATLAB takes 150 seconds, but
-\verb'sprandsym' uses a much higher-quality random number generator to create
-the tuples \verb'[I,J,X]'.  Considering just the time for
-\verb'sparse(I,J,X,n,n)' in \verb'sprandsym' (equivalent to
-\verb'GrB_Matrix_build'), the time is 70 seconds.  That is, each of these three
-methods, \verb'setElement' and \verb'build' in SuiteSparse:GraphBLAS, and
-\verb'sparse' in MATLAB, are equally fast.
+For the ARM64, this error is silenced with gcc v7.x and later, at least on
+Linux.
-%-------------------------------------------------------------------------------
-\subsection{Creating a finite-element matrix}
-%-------------------------------------------------------------------------------
-\label{fem}
+%----------------------------------------
+\subsection{On Microsoft Windows}
+\label{sec:windows}
+%----------------------------------------
-Suppose a finite-element matrix is being constructed, with \verb'k=40,000'
-finite-element matrices, each of size \verb'8'-by-\verb'8'.  The following
-operations (in pseudo-MATLAB notation) are very efficient in
-SuiteSparse:GraphBLAS.
+SuiteSparse:GraphBLAS is now ported to Microsoft Visual Studio.  However, that
+compiler is not ANSI C11 compliant.  As a result, GraphBLAS on Windows will have
+a few minor limitations.
+\begin{itemize}
+\item The MS Visual Studio compiler does not support the \verb'_Generic'
+keyword, required for the polymorphic GraphBLAS functions.  So for example, you
+will need to use \verb'GrB_Matrix_free' instead of just \verb'GrB_free'.
+\item Variable-length arrays are not supported, so user-defined
+types are limited to 128 bytes in size.  This can be changed by editing
+\verb'GB_VLA_MAXSIZE' in \verb'Source/GB_compiler.h', and recompiling SuiteSparse:GraphBLAS.
- {\footnotesize - \begin{verbatim} - A = sparse (m,n) ; % create an empty n-by-n sparse GraphBLAS matrix - for i = 1:k - construct a 8-by-8 sparse or dense finite-element F - I and J define where the matrix F is to be added: - I = a list of 8 row indices - J = a list of 8 column indices - % using GrB_assign, with the 'plus' accum operator: - A (I,J) = A (I,J) + F - end \end{verbatim}} +\item AVX acceleration is not enabled. +\end{itemize} -If this were done in MATLAB or in GraphBLAS with blocking mode enabled, the -computations would be extremely slow. A far better approach is to construct a -list of tuples \verb'[I,J,X]' and to use \verb'sparse(I,J,X,n,n)'. This is -identical to creating the same list of tuples in GraphBLAS and using the -\verb'GrB_Matrix_build', which is equally fast. +If you use a recent \verb'gcc' or \verb'icx' compiler on Windows other than the +Microsoft Compiler (\verb'cl'), these limitations can be avoided. -In SuiteSparse:GraphBLAS, the performance of both methods is essentially -identical, and roughly as fast as \verb'sparse' in MATLAB. Inside -SuiteSparse:GraphBLAS, \verb'GrB_assign' is doing the same thing. When -performing \verb'A(I,J)=A(I,J)+F', if it finds that it cannot quickly insert an -update into the \verb'A' matrix, it creates a list of pending tuples to be -assembled later on. When the matrix is ready for use in a subsequent -GraphBLAS operation (one that normally cannot use a matrix with pending -computations), the tuples are assembled all at once via -\verb'GrB_Matrix_build'. +The following instructions apply to Windows 10, CMake 3.16, and +Visual Studio 2019, but may work for earlier versions. -GraphBLAS operations on other matrices have no effect on when the pending -updates of a matrix are completed. Thus, any GraphBLAS method or operation can -be used to construct the \verb'F' matrix in the example above, without -affecting when the pending updates to \verb'A' are completed. +\begin{enumerate} -The MATLAB \verb'wathen.m' script is part of Higham's \verb'gallery' of -matrices \cite{Higham}. It creates a finite-element matrix with random -coefficients for a 2D mesh of size \verb'nx'-by-\verb'ny', a matrix formulation -by Wathen \cite{Wathen}. The pattern of the matrix is fixed; just the values -are randomized. The GraphBLAS equivalent can use either -\verb'GrB_Matrix_build', or \verb'GrB_assign'. Both methods have good -performance. The \verb'GrB_Matrix_build' version below is about 15\% to 20\% -faster than the MATLAB \verb'wathen.m' function, regardless of the problem -size. It uses the identical algorithm as \verb'wathen.m'. +\item Install CMake 3.16 or later, if not already installed. + See \url{https://cmake.org/} for details. - {\footnotesize +\item Install Microsoft Visual Studio, if not already installed. + See \url{https://visualstudio.microsoft.com/} for details. + Version 2019 is preferred, but earlier versions may also work. 
+ +\item Open a terminal window and type this in the + \verb'SuiteSparse/GraphBLAS/build' folder: + + \vspace{-0.1in} + {\small \begin{verbatim} - int64_t ntriplets = nx*ny*64 ; - I = malloc (ntriplets * sizeof (int64_t)) ; - J = malloc (ntriplets * sizeof (int64_t)) ; - X = malloc (ntriplets * sizeof (double )) ; - if (I == NULL || J == NULL || X == NULL) - { - FREE_ALL ; - return (GrB_OUT_OF_MEMORY) ; - } - ntriplets = 0 ; - for (int j = 1 ; j <= ny ; j++) - { - for (int i = 1 ; i <= nx ; i++) - { - nn [0] = 3*j*nx + 2*i + 2*j + 1 ; - nn [1] = nn [0] - 1 ; - nn [2] = nn [1] - 1 ; - nn [3] = (3*j-1)*nx + 2*j + i - 1 ; - nn [4] = 3*(j-1)*nx + 2*i + 2*j - 3 ; - nn [5] = nn [4] + 1 ; - nn [6] = nn [5] + 1 ; - nn [7] = nn [3] + 1 ; - for (int krow = 0 ; krow < 8 ; krow++) nn [krow]-- ; - for (int krow = 0 ; krow < 8 ; krow++) - { - for (int kcol = 0 ; kcol < 8 ; kcol++) - { - I [ntriplets] = nn [krow] ; - J [ntriplets] = nn [kcol] ; - X [ntriplets] = em (krow,kcol) ; - ntriplets++ ; - } - } - } - } - // A = sparse (I,J,X,n,n) ; - GrB_Matrix_build (A, I, J, X, ntriplets, GrB_PLUS_FP64) ; \end{verbatim}} + cmake .. \end{verbatim} } + \vspace{-0.1in} + +\item The \verb'cmake' command generates many files in + \verb'SuiteSparse/GraphBLAS/build', and the file \verb'graphblas.sln' in + particular. Open the generated \verb'graphblas.sln' file in Visual Studio. + +\item Optionally: right-click \verb'graphblas' in the left panel (Solution + Explorer) and select properties; then navigate to \verb'Configuration' + \verb'Properties', \verb'C/C++', \verb'General' and change the parameter + \verb'Multiprocessor Compilation' to \verb'Yes (/MP)'. Click \verb'OK'. + This will significantly speed up the compilation of GraphBLAS. + +\item Select the \verb'Build' menu item at the top of the window and + select \verb'Build Solution'. This should create a folder called + \verb'Release' and place the compiled \verb'graphblas.dll', + \verb'graphblas.lib', and \verb'graphblas.exp' files there. Please be + patient; some files may take a while to compile and sometimes may appear to + be stalled. Just wait. -The \verb'GrB_assign' version has the advantage of not requiring the -user application to construct the tuple list, and is almost as fast as using -\verb'GrB_Matrix_build'. The code is more elegant than either the MATLAB -\verb'wathen.m' function or its GraphBLAS equivalent above. Its performance is -comparable with the other two methods, but slightly slower, being about 5\% -slower than the MATLAB \verb'wathen', and 20\% slower than the GraphBLAS -method above. 
+ % Alternatively, type this command in the terminal window:
+ % {\small
+ % \begin{verbatim}
+ % devenv graphblas.sln /build "release|x64" /project graphblas \end{verbatim}}

- {\footnotesize
- \begin{verbatim}
- GrB_Matrix_new (&F, GrB_FP64, 8, 8) ;
- for (int j = 1 ; j <= ny ; j++)
- {
- for (int i = 1 ; i <= nx ; i++)
- {
- nn [0] = 3*j*nx + 2*i + 2*j + 1 ;
- nn [1] = nn [0] - 1 ;
- nn [2] = nn [1] - 1 ;
- nn [3] = (3*j-1)*nx + 2*j + i - 1 ;
- nn [4] = 3*(j-1)*nx + 2*i + 2*j - 3 ;
- nn [5] = nn [4] + 1 ;
- nn [6] = nn [5] + 1 ;
- nn [7] = nn [3] + 1 ;
- for (int krow = 0 ; krow < 8 ; krow++) nn [krow]-- ;
- for (int krow = 0 ; krow < 8 ; krow++)
- {
- for (int kcol = 0 ; kcol < 8 ; kcol++)
- {
- // F (krow,kcol) = em (krow, kcol)
- GrB_Matrix_setElement (F, em (krow,kcol), krow, kcol) ;
- }
- }
- // A (nn,nn) += F
- GrB_assign (A, NULL, GrB_PLUS_FP64, F, nn, 8, nn, 8, NULL) ;
- }
- } \end{verbatim}}

+\item Add the \verb'GraphBLAS/build/Release' folder to the Windows System path:

-Since there is no \verb'Mask', and since \verb'GrB_REPLACE' is not used, the call
-to \verb'GrB_assign' in the example above is identical to \verb'GxB_subassign'.
-Either one can be used, and their performance would be identical.

+ \begin{itemize}
+ \item Open the \verb'Start Menu' and type \verb'Control Panel'.
+ \item Select the \verb'Control Panel' app.
+ \item When the app opens, select \verb'System and Security'.
+ \item Under \verb'System and Security', select \verb'System'.
+ \item From the top left side of the \verb'System' window, select
+ \verb'Advanced System Settings'. You may have to authenticate
+ at this step.
+ \item The \verb'System Properties' window should appear with the
+ \verb'Advanced' tab selected;
+ select \verb'Environment Variables'.
+ \item The \verb'Environment Variables' window displays two sections, one for
+ \verb'User' variables and the other for \verb'System' variables. Under
+ the \verb'System' variables section, scroll to and select \verb'Path',
+ then select \verb'Edit'. An editor window appears, allowing you to add,
+ modify, delete, or re-order the parts of the \verb'Path'.
+ \item Add the full path of the \verb'GraphBLAS\build\Release' folder
+ (typically starting with \verb'C:\Users\you\'..., where \verb'you' is
+ your Windows username) to the \verb'Path'.
+ \item If the above steps do not work, you can instead copy the
+ \verb'graphblas.*' files from \verb'GraphBLAS\build\Release' into any
+ existing folder listed in your \verb'Path'.
+ \end{itemize}

-Refer to the \verb'wathen.c' function in the \verb'Demo' folder, which
-uses GraphBLAS to implement the two methods above, and two additional ones.

+\item The \verb'GraphBLAS/Include/GraphBLAS.h' file must be included in user
+ applications via \verb'#include "GraphBLAS.h"'. This is already done for
+ you in the Octave/MATLAB interface discussed in the next section.
+ A minimal example using only the non-polymorphic functions appears
+ after this list.

-%-------------------------------------------------------------------------------
-\subsection{Reading a matrix from a file}
-%-------------------------------------------------------------------------------
-\label{read}

+\end{enumerate}

-See also \verb'LAGraph_mmread' and \verb'LAGraph_mmwrite', which
-can read and write any matrix in Matrix Market format, and
-\verb'LAGraph_binread' and \verb'LAGraph_binwrite', which read/write a matrix
-from a binary file. The binary file I/O functions are much faster than
-the \verb'read_matrix' function described here, and also much faster than
-\verb'LAGraph_mmread' and \verb'LAGraph_mmwrite'.
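+As a minimal sketch (this program is illustrative only, and is not part of
+the GraphBLAS distribution), the following trivial application includes
+\verb'GraphBLAS.h' and restricts itself to the non-polymorphic functions,
+so it should compile with MS Visual Studio:
+
+ {\small
+ \begin{verbatim}
+ #include "GraphBLAS.h"
+ int main (void)
+ {
+     GrB_Matrix A = NULL ;
+     GrB_init (GrB_NONBLOCKING) ;
+     GrB_Matrix_new (&A, GrB_FP64, 4, 4) ;
+     // non-polymorphic form of GrB_Matrix_setElement: A (0,0) = 3.14159
+     GrB_Matrix_setElement_FP64 (A, 3.14159, 0, 0) ;
+     // non-polymorphic form of GrB_free:
+     GrB_Matrix_free (&A) ;
+     GrB_finalize ( ) ;
+ } \end{verbatim} }
+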
+%----------------------------------------
+\subsection{Compiling the Octave/MATLAB interface (for Octave, and for MATLAB R2020a and earlier)}
+%----------------------------------------
+\label{gbmake}

-The \verb'read_matrix' function in the \verb'Demo' reads in a triplet matrix
-from a file, one line per entry, and then uses \verb'GrB_Matrix_build' to
-create the matrix. It creates a second copy with \verb'GrB_Matrix_setElement',
-just to test that method and compare the run times.
-Section~\ref{random} has already compared
-\verb'build' versus \verb'setElement'.

+I'm working closely with John Eaton (the primary developer of Octave) to
+enable SuiteSparse:GraphBLAS to work with Octave, and thus Octave 7 is
+required. The latest version of Octave is 6.4.0, so you need to download and
+install the development version of Octave 7 to use SuiteSparse:GraphBLAS within
+Octave.

-The function can return the matrix as-is, which may be rectangular or
-unsymmetric. If an input parameter is set to make the matrix symmetric,
-\verb'read_matrix' computes \verb"A=(A+A')/2" if \verb'A' is square (turning
-all directed edges into undirected ones). If \verb'A' is rectangular, it
-creates a bipartite graph, which is the same as the augmented matrix,
-\verb"A = [0 A ; A' 0]".
-If \verb'C' is an \verb'n'-by-\verb'n' matrix, then \verb"C=(C+C')/2" can be
-computed as follows in GraphBLAS, (the \verb'scale2' function divides an entry
-by 2):

+First, compile the SuiteSparse:GraphBLAS dynamic library
+(\verb'libgraphblas.so' for Linux, \verb'libgraphblas.dylib' for Mac,
+or \verb'graphblas.dll' for Windows), as described in the prior two
+subsections.

- \vspace{-0.05in}
- {\footnotesize

+On the Mac, SuiteSparse:GraphBLAS v6.1.4 and Octave 7 will work on
+Apple Silicon (thanks to G{\'{a}}bor Sz{\'{a}}rnyas). Here are his instructions
+(replicated from
+\url{https://github.com/DrTimothyAldenDavis/GraphBLAS/issues/90}); do
+these in your Mac Terminal:

+\begin{itemize}
+\item Building Octave. Grab the brew formula:

+ {\scriptsize
 \begin{verbatim}
- GrB_Descriptor_new (&dt2) ;
- GrB_Descriptor_set (dt2, GrB_INP1, GrB_TRAN) ;
- GrB_Matrix_new (&A, GrB_FP64, n, n) ;
- GrB_eWiseAdd (A, NULL, NULL, GrB_PLUS_FP64, C, C, dt2) ; // A=C+C'
- GrB_free (&C) ;
- GrB_Matrix_new (&C, GrB_FP64, n, n) ;
- GrB_UnaryOp_new (&scale2_op, scale2, GrB_FP64, GrB_FP64) ;
- GrB_apply (C, NULL, NULL, scale2_op, A, NULL) ; // C=A/2
- GrB_free (&A) ;
- GrB_free (&scale2_op) ; \end{verbatim}}

+ wget https://raw.githubusercontent.com/Homebrew/homebrew-core/master/Formula/octave.rb
+ \end{verbatim} }

-This is of course not nearly as elegant as \verb"A=(A+A')/2" in MATLAB, but
-with minor changes it can work on any type and use any built-in operators
-instead of \verb'PLUS', or it can use any user-defined operators and types.
-The above code in SuiteSparse:GraphBLAS takes 0.60 seconds for the
-\verb'Freescale2' matrix, slightly slower than MATLAB (0.55 seconds).

+\item Edit \verb'octave.rb'.

-Constructing the augmented system is more complicated using the GraphBLAS C API
-Specification since it does not yet have a simple way of specifying a range of
-row and column indices, as in \verb'A(10:20,30:50)' in MATLAB (\verb'GxB_RANGE'
-is a SuiteSparse:GraphBLAS extension that is not in the Specification). Using
-the C API in the Specification, the application must instead build a list of
-indices first, \verb'I=[10, 11' \verb'...' \verb'20]'.

+ Add \verb`"disable-docs"` to \verb`args` (or ensure that you have a working
+ texinfo installation).
+ Edit the Mercurial (\verb`hg`) repository reference: switch from the
+ \verb`default` branch (containing code for Octave v8.0) to \verb`stable`
+ (v7.0). Then do:

-Thus, to compute the MATLAB equivalent of \verb"A = [0 A ; A' 0]", index lists
-\verb'I' and \verb'J' must first be constructed:

+ {\small
+ \begin{verbatim}
+ brew install --head ./octave.rb
+ \end{verbatim} }

- \vspace{-0.05in}
- {\footnotesize

+\item Building the tests (\verb'gbmake').
+ Grab the OpenMP binaries as described at
+ \url{https://mac.r-project.org/openmp/}
+
+ {\scriptsize
 \begin{verbatim}
- int64_t n = nrows + ncols ;
- I = malloc (nrows * sizeof (int64_t)) ;
- J = malloc (ncols * sizeof (int64_t)) ;
- // I = 0:nrows-1
- // J = nrows:n-1
- if (I == NULL || J == NULL)
- {
- if (I != NULL) free (I) ;
- if (J != NULL) free (J) ;
- return (GrB_OUT_OF_MEMORY) ;
- }
- for (int64_t k = 0 ; k < nrows ; k++) I [k] = k ;
- for (int64_t k = 0 ; k < ncols ; k++) J [k] = k + nrows ; \end{verbatim}}

+ curl -O https://mac.r-project.org/openmp/openmp-13.0.0-darwin21-Release.tar.gz
+ sudo tar fvxz openmp-13.0.0-darwin21-Release.tar.gz -C /
+ \end{verbatim} }

-Once the index lists are generated, however, the resulting GraphBLAS operations
-are fairly straightforward, computing \verb"A=[0 C ; C' 0]".

+\item Do the following to edit \verb'gbmake.m':

- \vspace{-0.05in}
- {\footnotesize

+ {\scriptsize
 \begin{verbatim}
- GrB_Descriptor_new (&dt1) ;
- GrB_Descriptor_set (dt1, GrB_INP0, GrB_TRAN) ;
- GrB_Matrix_new (&A, GrB_FP64, n, n) ;
- // A (nrows:n-1, 0:nrows-1) = C'
- GrB_assign (A, NULL, NULL, C, J, ncols, I, nrows, dt1) ;
- // A (0:nrows-1, nrows:n-1) = C
- GrB_assign (A, NULL, NULL, C, I, nrows, J, ncols, NULL) ; \end{verbatim}}

+ sed -i.bkp 's/-fopenmp/-Xclang -fopenmp/g' @GrB/private/gbmake.m
+ \end{verbatim} }

-This takes 1.38 seconds for the \verb'Freescale2' matrix, almost as fast as \newline
-\verb"A=[sparse(m,m) C ; C' sparse(n,n)]" in MATLAB (1.25 seconds).
-The \verb'GxB_Matrix_concat' function would be faster still (this example
-was written prior to \verb'GxB_Matrix_concat' was added to SuiteSparse:GraphBLAS).

+\end{itemize}

-Both calls to \verb'GrB_assign' use no accumulator, so the second one
-causes the partial matrix \verb"A=[0 0 ; C' 0]" to be built first, followed by
-the final build of \verb"A=[0 C ; C' 0]". A better method, but not an obvious
-one, is to use the \verb'GrB_FIRST_FP64' accumulator for both assignments. An
-accumulator enables SuiteSparse:GraphBLAS to determine that that entries
-created by the first assignment cannot be deleted by the second, and thus it
-need not force completion of the pending updates prior to the second
-assignment.

+Once Octave 7 and SuiteSparse:GraphBLAS are compiled and installed,
+and \verb'gbmake.m' is modified if needed for Octave 7 on the Mac
+(or if using MATLAB), continue with the following instructions:

-SuiteSparse:GraphBLAS also adds a \verb'GxB_RANGE' mechanism that mimics
-the MATLAB colon notation.
This speeds up the method and simplifies the
-code the user needs to write to compute \verb"A=[0 C ; C' 0]":

+\begin{enumerate}
+\item In the Octave/MATLAB command window:

- \vspace{-0.05in}
- {\footnotesize

+ {\small
 \begin{verbatim}
- int64_t n = nrows + ncols ;
- GrB_Matrix_new (&A, xtype, n, n) ;
- GrB_Index I_range [3], J_range [3] ;
- I_range [GxB_BEGIN] = 0 ;
- I_range [GxB_END ] = nrows-1 ;
- J_range [GxB_BEGIN] = nrows ;
- J_range [GxB_END ] = ncols+nrows-1 ;
- // A (nrows:n-1, 0:nrows-1) += C'
- GrB_assign (A, NULL, GrB_FIRST_FP64, // or NULL,
- C, J_range, GxB_RANGE, I_range, GxB_RANGE, dt1) ;
- // A (0:nrows-1, nrows:n-1) += C
- GrB_assign (A, NULL, GrB_FIRST_FP64, // or NULL,
- C, I_range, GxB_RANGE, J_range, GxB_RANGE, NULL) ; \end{verbatim}}

+ cd GraphBLAS/GraphBLAS/@GrB/private
+ gbmake \end{verbatim} }
+
+\item Follow the remaining instructions in the
+ \verb'GraphBLAS/GraphBLAS/README.md' file, to revise your
+ Octave/MATLAB path and \verb'startup.m' file.
+
+\item As a quick test, try the command \verb'GrB(1)', which
+ creates and displays a 1-by-1 GraphBLAS matrix. For a longer test, do the
+ following:
+
+ {\small
+ \begin{verbatim}
+ cd GraphBLAS/GraphBLAS/test
+ gbtest \end{verbatim} }
+
+\item In Windows, if the tests fail with an error stating that the
+ mex file is invalid because the module could not be found, it means
+ that MATLAB could not find the compiled \verb'graphblas.lib', \verb'*.dll'
+ or \verb'*.exp' files in the \verb'build/Release' folder. This can happen
+ if your Windows System path is not set properly, or if Windows is not
+ recognizing the \verb'GraphBLAS/build/Release' folder (see
+ Section~\ref{sec:windows}). Or, you might not have permission to change your
+ Windows System path. In this case, do the following in the MATLAB Command
+ Window:

-Any operator will suffice because it is not actually applied. An operator is
-only applied to the set intersection, and the two assignments do not overlap.
-If an \verb'accum' operator is used, only the final matrix is built, and the
-time in GraphBLAS drops slightly to 1.25 seconds. This is a very small
-improvement because in this particular case, SuiteSparse:GraphBLAS is able to
-detect that no sorting is required for the first build, and the second one is a
-simple concatenation. In general, however, allowing GraphBLAS to postpone
-pending updates can lead to significant reductions in run time.

+ \vspace{-0.1in}
+ {\small
+ \begin{verbatim}
+ cd GraphBLAS/build/Release
+ GrB(1) \end{verbatim} }

-%-------------------------------------------------------------------------------
-\subsection{User-defined types and operators}
-%-------------------------------------------------------------------------------
-\label{user}

+ \vspace{-0.1in}
+ After this step, the GraphBLAS library will be loaded into MATLAB. You may
+ need to add the above lines in your \verb'Documents/MATLAB/startup.m' file,
+ so that they are done each time MATLAB starts. You will also need to do
+ this after \verb'clear all' or \verb'clear mex', since those MATLAB
+ commands remove all loaded libraries from MATLAB.

-The \verb'Demo' folder contains two working examples of user-defined types,
-first discussed in Section~\ref{type_new}: \verb'double complex', and a
-user-defined \verb'typedef' called \verb'wildtype' with a \verb'struct'
-containing a string and a 4-by-4 \verb'float' matrix.
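+ For example, the added lines in \verb'startup.m' would look like this
+ sketch (the path is hypothetical; use the actual location of your
+ \verb'GraphBLAS\build\Release' folder):
+
+ {\small
+ \begin{verbatim}
+ % in Documents/MATLAB/startup.m: load GraphBLAS when MATLAB starts
+ cd C:\Users\you\GraphBLAS\build\Release
+ GrB (1) ;   % any @GrB command loads the compiled library \end{verbatim} }
+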
+ You might also get an error ``the specified procedure cannot be found.''
+ This can occur if you have upgraded your GraphBLAS library from a prior
+ version, and some of the compiled files \verb'@GrB/private/*.mex*'
+ are stale. Try the command \verb'gbmake all' in the MATLAB Command
+ Window, which forces all of the MATLAB interface to be recompiled.
+ Or, try deleting all \verb'@GrB/private/*.mex*' files and running
+ \verb'gbmake' again.

-{\bf Double Complex:}
-Prior to v3.3, GraphBLAS did not have a native complex type. It now appears as
-the \verb'GxB_FC64' predefined type, but a complex type can also easily added
-as a user-defined type. The \verb'Complex_init' function in the
-\verb'usercomplex.c' file in the \verb'Demo' folder creates the \verb'Complex'
-type based on the ANSI C11 \verb'double complex' type.
-It creates a full suite of operators that correspond to every
-built-in GraphBLAS operator, both binary and unary. In addition, it
-creates the operators listed in the following table, where $D$ is
-\verb'double' and $C$ is \verb'Complex'.

+\item On Windows, the \verb'casin', \verb'casinf', \verb'casinh', and
+ \verb'casinhf' functions provided by Microsoft do not return the correct
+ imaginary part. As a result, \verb'GxB_ASIN_FC32', \verb'GxB_ASIN_FC64',
+ \verb'GxB_ASINH_FC32', and \verb'GxB_ASINH_FC64' do not work properly on
+ Windows. This affects the \verb'GrB/asin', \verb'GrB/acsc',
+ \verb'GrB/asinh', and \verb'GrB/acsch' functions in the MATLAB interface.
+ See the MATLAB tests bypassed in \verb'gbtest76.m' for details, in the
+ \newline
+ \verb'GraphBLAS/GraphBLAS/test' folder.
+ %% FUTURE: fix asin and acsc on Windows for the complex case.

-\vspace{0.1in}
-{\footnotesize
-\begin{tabular}{llll}
-\hline
-name & types & Octave/MATLAB & description \\
- & & equivalent & \\
-\hline
-\verb'Complex_complex' & $D \times D \rightarrow C$ & \verb'z=complex(x,y)' & complex from real and imag. \\
-\hline
-\verb'Complex_conj' & $C \rightarrow C$ & \verb'z=conj(x)' & complex conjugate \\
-\verb'Complex_real' & $C \rightarrow D$ & \verb'z=real(x)' & real part \\
-\verb'Complex_imag' & $C \rightarrow D$ & \verb'z=imag(x)' & imaginary part \\
-\verb'Complex_angle' & $C \rightarrow D$ & \verb'z=angle(x)' & phase angle \\
-\verb'Complex_complex_real' & $D \rightarrow C$ & \verb'z=complex(x,0)' & real to complex real \\
-\verb'Complex_complex_imag' & $D \rightarrow C$ & \verb'z=complex(0,x)' & real to complex imag. \\
-\hline
-\end{tabular}
-}

+\end{enumerate}

-The \verb'Complex_init' function creates two monoids (\verb'Complex_add_monoid'
-and \verb'Complex_times_monoid') and a semiring \verb'Complex_plus_times' that
-corresponds to the conventional linear algebra for complex matrices. The
-include file \verb'usercomplex.h' in the \verb'Demo' folder is available so
-that this user-defined \verb'Complex' type can easily be imported into any
-other user application. When the user application is done, the
-\verb'Complex_finalize' function frees the \verb'Complex' type and its
-operators, monoids, and semiring.
-NOTE: the \verb'Complex' type is not supported in this Demo in Microsoft
-Visual Studio.

+%----------------------------------------
+\subsection{Compiling the Octave/MATLAB interface (for MATLAB R2021a and later)}
+\label{R2021a}
+%----------------------------------------

-{\bf Struct-based:}
-In addition, the \verb'wildtype.c' program creates a user-defined
-\verb'typedef' of a \verb'struct' containing a dense 4-by-4 \verb'float'
-matrix, and a 64-character string.
It constructs an additive monoid that adds -two 4-by-4 dense matrices, and a multiplier operator that multiplies two 4-by-4 -matrices. Each of these 4-by-4 matrices is treated by GraphBLAS as a -``scalar'' value, and they can be manipulated in the same way any other -GraphBLAS type can be manipulated. The purpose of this type is illustrate the -endless possibilities of user-defined types and their use in GraphBLAS. +MATLAB R2021a includes its own copy of SuiteSparse:GraphBLAS v3.3.3, as the +file \verb'libmwgraphblas.so', which is used for the built-in \verb'C=A*B' when +both \verb'A' and \verb'B' are sparse (see the Release Notes of MATLAB R2021a, +which discusses the performance gained in MATLAB by using GraphBLAS). -%------------------------------------------------------------------------------- -\subsection{User applications using OpenMP or other threading models} -%------------------------------------------------------------------------------- -\label{threads} +That's great news for the impact of GraphBLAS on MATLAB itself, and the domain +of high performance computing in general, but it causes a linking problem when +using this MATLAB interface for GraphBLAS. The two use different versions of +the same library, and a segfault arises if the MATLAB interface for v4.x (or +later) tries to link with the older GraphBLAS v3.3.3 library. Likewise, the +built-in \verb'C=A*B' causes a segfault if it tries to use the newer GraphBLAS +v4.x (or later) libraries. -An example demo program (\verb'openmp_demo') is included that illustrates how a -multi-threaded user application can use GraphBLAS. +To resolve this issue, a second GraphBLAS library must be compiled, +\verb'libgraphblas_renamed', where the internal symbols are all renamed so they +do not conflict with the \verb'libmwgraphblas' library. Then both libraries +can co-exist in the same instance of MATLAB. -The results from the \verb'openmp_demo' program may appear out of order. This -is by design, simply to show that the user application is running in parallel. -The output of each thread should be the same. In particular, each thread -generates an intentional error, and later on prints it with \verb'GrB_error'. -It will print its own error, not an error from another thread. When all the -threads finish, the leader thread prints out each matrix generated by each -thread. +To do this, go to the \verb'GraphBLAS/GraphBLAS' folder, containing the +MATLAB interface. That folder contains a \verb'CMakeLists.txt' file to +compile the \verb'libgraphblas_renamed' library. See the instructions +for how to compile the C library \verb'libgraphblas', and repeat them but +using the folder \newline +\verb'SuiteSparse/GraphBLAS/GraphBLAS/build' instead of \newline +\verb'SuiteSparse/GraphBLAS/build'. -GraphBLAS can also be combined with user applications that rely on MPI, the -Intel TBB threading library, POSIX pthreads, Microsoft Windows threads, or any -other threading library. In all cases, GraphBLAS will be thread safe. +This will compile the renamed SuiteSparse:GraphBLAS dynamic library +(\verb'libgraphblas_renamed.so' for Linux, \verb'libgraphblas_renamed.dylib' +for Mac, or \verb'graphblas_renamed.dll' for Windows). These can be +placed in the same system-wide location as the standard \verb'libgraphblas' +libraries, such as \verb'/usr/local/lib' for Linux. The two pairs of +libraries share the identical \verb'GraphBLAS.h' include file. 
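+
+For example, on Linux or the Mac the build and install steps are a sketch
+like the following (assuming the standard \verb'cmake'/\verb'make' build
+described earlier; adjust paths as needed):
+
+ {\small
+ \begin{verbatim}
+ # build and install the renamed library (sketch)
+ cd SuiteSparse/GraphBLAS/GraphBLAS/build
+ cmake ..
+ make
+ sudo make install \end{verbatim} }
+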
-\newpage
-%-------------------------------------------------------------------------------
-\section{Compiling and Installing SuiteSparse:GraphBLAS}
-%-------------------------------------------------------------------------------
-\label{sec:install}

+Next, compile the MATLAB interface as described in Section~\ref{gbmake}. For
+any instructions in that Section that refer to the \verb'GraphBLAS/build'
+folder (Linux and Mac) or \verb'GraphBLAS/build/Release' (Windows), use \newline
+\verb'GraphBLAS/GraphBLAS/build' (Linux and Mac) or \newline
+\verb'GraphBLAS/GraphBLAS/build/Release' (Windows) instead.
+
+The resulting functions for your \verb'@GrB' object will now work just fine;
+no other changes are needed. You can even use the GraphBLAS mexFunctions
+compiled in MATLAB R2021a in earlier versions of MATLAB (such as R2020a).

 %----------------------------------------
-\subsection{On Linux and Mac}
+\subsection{Setting the C flags and using CMake}
 %----------------------------------------

-GraphBLAS makes extensive use of features in the ANSI C11 standard, and thus a
-C compiler supporting this version of the C standard is required to use
-all features of GraphBLAS.
+Next, do \verb'make' in the \verb'build' directory. If the build fails, see
+the \verb'CMakeLists.txt' file. You can edit that file to pass
+compiler-specific options to your compiler. Locate this section in the
+\verb'CMakeLists.txt' file. Use the \verb'set' command in \verb'cmake', as in
+the example below, to set the compiler flags you need.

-{\bf Any version of the Intel \verb'icx' compiler is highly recommended.} In
-most cases, the Intel \verb'icx' and the Intel OpenMP library (\verb'libiomp')
-result in the best performance. The \verb'gcc' and the GNU OpenMP library
-(\verb'libgomp') generally gives good performance: typically on par with icx
-but in a few special cases significantly slower. The Intel \verb'icc' compiler
-is not recommended; it results in poor performance for
-\verb'#pragma omp atomic'.
+ {\small
+ \begin{verbatim}
+ # check which compiler is being used. If you need to make
+ # compiler-specific modifications, here is the place to do it.
+ if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+ # cmake 2.8 workaround: gcc needs to be told to do ANSI C11.
+ # cmake 3.0 doesn't have this problem.
+ set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -lm " )
+ ...
+ elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel")
+ ...
+ elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+ ...
+ elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
+ ...
+ endif ( )
+ \end{verbatim} }

-On the Mac (OS X), \verb'clang' 8.0.0 in \verb'Xcode' version 8.2.1 is
-sufficient, although earlier versions of \verb'Xcode' may work as well. For
-the GNU \verb'gcc' compiler, version 4.9 or later is required, but best
-performance is obtained in 9.3 or later. Version 3.13 or later of \verb'cmake'
-is required; version 3.17 is preferred.
+To compile SuiteSparse:GraphBLAS without running the demos, use \newline
+\verb'make library' in the top-level directory, or \verb'make' in the
+\verb'build' directory.

-If you are using a pre-C11 ANSI C compiler, such as Microsoft Visual Studio,
-then the \verb'_Generic' keyword is not available. SuiteSparse:GraphBLAS
-will still compile, but you will not have access to polymorphic functions
-such as \verb'GrB_assign'. You will need to use the non-polymorphic functions
-instead.
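+
+As a sketch of an alternative that avoids editing \verb'CMakeLists.txt',
+extra C flags can typically be passed on the \verb'cmake' command line
+(the \verb'-O3' flag here is only a placeholder for whatever flags you need):
+
+ {\small
+ \begin{verbatim}
+ # sketch: pass extra C flags without editing CMakeLists.txt
+ cd build
+ cmake -DCMAKE_C_FLAGS="-O3" ..
+ make \end{verbatim} }
+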
+Several compile-time options can be selected by editing the \verb'Source/GB.h'
+file, but these are meant only for code development of SuiteSparse:GraphBLAS
+itself, not for end-users of SuiteSparse:GraphBLAS.

-To compile SuiteSparse:GraphBLAS, simply type \verb'make' in the main GraphBLAS
-folder, which compiles the library with your default system compiler. This
-compile GraphBLAS using 8 threads, which will take a long time. To compile with
-more threads (40, for this example), use:
+%----------------------------------------
+\subsection{Using a plain makefile}
+\label{altmake}
+%----------------------------------------

- {\small
- \begin{verbatim}
- make JOBS=40 \end{verbatim} }
+The \verb'GraphBLAS/alternative' directory contains a simple \verb'Makefile'
+that can be used to compile SuiteSparse:GraphBLAS. This is a useful option
+if you do not have the required version of \verb'cmake'. This \verb'Makefile'
+can even compile the entire library with a C++ compiler, which cannot be
+done with \verb'CMake'.

-To use a non-default compiler with 4 threads:
+This alternative \verb'Makefile' does not build the
+\verb'libgraphblas_renamed.so' library required for MATLAB R2021a (see
+Section~\ref{R2021a}). This can be done by revising the \verb'Makefile',
+however: add the \verb'-DGBRENAME=1' flag, and change the library name
+from \verb'libgraphblas' to \verb'libgraphblas_renamed'.
+
+%----------------------------------------
+\subsection{Running the Demos}
+%----------------------------------------
+
+After running \verb'make' in the top-level directory to compile the library,
+type \verb'make run' to run the demos (also in the top-level directory).
+You can also run the demos after compiling with \verb'make all':

 {\small
 \begin{verbatim}
- make CC=icx CXX=icpx JOBS=4 \end{verbatim} }
+ make all
+ cd Demo
+ ./demo \end{verbatim} }

-GraphBLAS v6.1.3 and later use the \verb'cpu_features' package by Google to
-determine if the target architecture supports AVX2 and/or AVX512F (on Intel
-x86\_64 architectures only). In case you have build issues with this package,
-you can compile without it (and then AVX2 and AVX512F acceleration will not
-be used):
+The \verb'./demo' command is a script that runs the demos with various input
+matrices in the \verb'Demo/Matrix' folder. The output of the demos will be
+compared with expected output files in \verb'Demo/Output'.
+
+NOTE:
+DO NOT publish benchmarks of these demos, and do not link against the
+demo library in any user application. These codes are sometimes slow,
+and are meant as simple illustrations only, not for performance. The fastest
+methods are in LAGraph, not in SuiteSparse/GraphBLAS/Demo. Benchmark LAGraph
+instead. Eventually, all GraphBLAS/Demos methods will be removed, and LAGraph
+will serve all uses: for illustration, benchmarking, and production uses.

- {\small
- \begin{verbatim}
- make CMAKE_OPTIONS='-DGBNCPUFEAT=1' \end{verbatim} }
+%----------------------------------------
+\subsection{Installing SuiteSparse:GraphBLAS}
+%----------------------------------------

-Without \verb'cpu_features', it is still possible to enable AVX2 and AVX512F.
-Rather than relying on run-time tests, you can use these flags to enable
-both AVX2 and AVX512F, without relying on \verb'cpu_features':
+To install the library (typically in \verb'/usr/local/lib' and
+\verb'/usr/local/include' for Linux systems), go to the top-level GraphBLAS
+folder and type:

 {\small
 \begin{verbatim}
- make CMAKE_OPTIONS='-DGBNCPUFEAT=1 -DGBAVX2=1 -DGBAVX512F=1' \end{verbatim} }
+ sudo make install \end{verbatim} }

-To use multiple options, separate them by a space. For example, to build
-just the library but not \verb'cpu_features', and to enable
-AVX2 but not AVX512F, and use 40 threads to compile:
+%----------------------------------------
+\subsection{Linking issues after installation}
+%----------------------------------------
+
+My Linux distro (Ubuntu 18.04) includes a copy of \verb'libgraphblas.so.1',
+which is SuiteSparse:GraphBLAS v1.1.2. After installing SuiteSparse:GraphBLAS
+in \verb'/usr/local/lib' (with \verb'sudo make install'), compiling a simple
+stand-alone program links against \verb'libgraphblas.so.1' instead of the
+latest version, while at the same time accessing the latest version of the
+include file as \verb'/usr/local/include/GraphBLAS.h'. This command fails:

 {\small
 \begin{verbatim}
- make CMAKE_OPTIONS='-DGBNCPUFEAT=1 -DGBAVX2=1' JOBS=40 \end{verbatim} }
+ gcc prog.c -lgraphblas \end{verbatim} }

-After compiling the library, you can compile the demos with
-\verb'make all' and then \verb'make run' while in the top-level
-GraphBLAS folder.
+Revising my \verb'LD_LIBRARY_PATH' to put \verb'/usr/local/lib' first in the
+library directory order didn't help. If you encounter this problem, try one of
+the following options (all four work for me, and link against the proper
+version, \verb'/usr/local/lib/libgraphblas.so.6.1.4' for example):

-If \verb'cmake' or \verb'make' fail, it might be that your default compiler
-does not support ANSI C11. Try another compiler. For example, try one of
-these options. Go into the \verb'build' directory and type one of these:

 {\small
 \begin{verbatim}
- CC=gcc cmake ..
- CC=gcc-11 cmake ..
- CC=xlc cmake ..
- CC=icx cmake .. \end{verbatim} }
+ gcc prog.c -l:libgraphblas.so.6
+ gcc prog.c -l:libgraphblas.so.6.1.4
+ gcc prog.c /usr/local/lib/libgraphblas.so
+ gcc prog.c -Wl,-v -L/usr/local/lib -lgraphblas \end{verbatim} }

-You can also do the following in the top-level GraphBLAS folder instead:
+This \verb'prog.c' test program is a trivial one, which works in v1.0 and
+later:

 {\small
 \begin{verbatim}
- CC=gcc make
- CC=gcc-11 make
- CC=xlc make
- CC=icx make \end{verbatim} }
+ #include "GraphBLAS.h"
+ int main (void)
+ {
+ GrB_init (GrB_NONBLOCKING) ;
+ GrB_finalize ( ) ;
+ } \end{verbatim} }

-For faster compilation, you can specify a parallel make. For example,
-to use 32 parallel jobs and the \verb'gcc' compiler, do the following:
+Compile the program above, then use this command to ensure
+\verb'libgraphblas.so.6' appears:

 {\small
 \begin{verbatim}
- JOBS=32 CC=gcc make \end{verbatim} }
+ ldd a.out \end{verbatim} }

-If you do not have \verb'cmake', refer to Section~\ref{altmake}.

 %----------------------------------------
-\subsection{More details on the Mac}
+\subsection{Running the tests}
 %----------------------------------------

-SuiteSparse:GraphBLAS requires OpenMP for its internal parallelism, but
-OpenMP is not on the Mac by default.

-If you have the Intel compiler and OpenMP library, then use the following
-in the top-level \verb'GraphBLAS' folder.
OpenMP will be found automatically:

- {\small
- \begin{verbatim}
- make CC=icc CXX=icpc \end{verbatim} }
+To run a short test, type \verb'make run' at the top-level \verb'GraphBLAS'
+folder. This will run all the demos in \verb'GraphBLAS/Demos'. MATLAB is not
+required.

-The following instructions work on MacOS Big Sur (v11.3)
-and MacOS Monterey (12.1), using
-cmake 3.13 or later:
+To perform the extensive tests in the \verb'Test' folder, and the statement
+coverage tests in \verb'Tcov', MATLAB R2017a is required. See the
+\verb'README.txt' files in those two folders for instructions on how to run the
+tests. The tests in the \verb'Test' folder have been ported to MATLAB on
+Linux, MacOS, and Windows. The \verb'Tcov' tests do not work on Windows. The
+MATLAB interface test (\verb'gbtest') works on all platforms; see the
+\verb'GraphBLAS/GraphBLAS' folder for more details.

-First install Xcode (see \url{https://developer.apple.com/xcode},
-and then install the command line tools for Xcode:
+%----------------------------------------
+\subsection{Cleaning up}
+%----------------------------------------

- {\small
- \begin{verbatim}
- cd /Applications/Utilities
- xcode-select —install \end{verbatim} }
+To remove all compiled files, type \verb'make' \verb'distclean' in the top-level
+GraphBLAS folder.

-Next, install brew, at \url{https://brew.sh}.
+%-------------------------------------------------------------------------------
+\section{Release Notes}
+%-------------------------------------------------------------------------------

-If not used for the MATLAB mexFunction interface, a recent update of the Apple
-Clang compiler now works with \verb'libomp' and the
-\verb'GraphBLAS/CMakeLists.txt'. To use the MATLAB mexFunction, however, you
-must use \verb'gcc' (\verb'gcc-11' is recommended). Using Clang will result in
-a segfault when you attempt to use the \verb'@GrB' interface in MATLAB.
+\begin{itemize}

-With MacOS Big Sur install \verb'gcc-11', \verb'cmake', and OpenMP, and then
-compile GraphBLAS. cmake 3.13 or later is required. For the MATLAB
-mexFunctions, you must use \verb'gcc-11'; the \verb'libomp' from \verb'brew'
-will allow you to compile the mexFunctions but they will not work properly.
+\item Version 7.2.0 (Aug 8, 2022)

- {\small
- \begin{verbatim}
- brew install cmake
- brew install libomp
- brew install gcc
- cd GraphBLAS/GraphBLAS
- make CC=gcc-11 CXX=g++-11 JOBS=8 \end{verbatim} }
+ \begin{packed_itemize}
+ \item added ZSTD as a compression option for serialize/deserialize:
+ Version 1.5.3 by Yann Collet,
+ \url{https://github.com/facebook/zstd.git}.
+ Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+ Included in SuiteSparse:GraphBLAS via its BSD-3-clause license.
+ The default method is now ZSTD, level 1.
+ \item \verb'GxB_Matrix_reshape*' added.
+ \item MATLAB interface: \verb'reshape', \verb'C(:)=A', \verb'C=A(:)' are
+ faster. Better error messages.
+ \end{packed_itemize}

-The above instructions assume MATLAB R2021a, using
-\verb'libgraphblas_renamed.dylib', since that version of MATLAB includes its
-own copy of SuiteSparse:GraphBLAS (\verb'libmwgraphblas.dylib') but at version
-v3.3.3, not the latest version.
+\item Version 7.1.2 (July 8, 2022)

-Next, compile the MATLAB mexFunctions. I had to edit this file first:
+ \begin{packed_itemize}
+ \item MATLAB interface: linear indexing added for C(:)=A, C=A(:), and
+ single-output I=find(C). Faster bandwidth, istriu, istril,
+ isbanded, isdiag. C(I,J)=A can now grow the size of C.
+ \end{packed_itemize} -{\small -\begin{verbatim} -/Users/davis/Library/Application Support/MathWorks/MATLAB/R2021a/mex_C_maci64.xml \end{verbatim} } +\item Version 7.1.1 (June 3, 2022) -where you would replace \verb'davis' with your MacOS user name. -Change lines 4 and 18, where both cases of \verb'MACOSX_DEPLOYMENT_TARGET=10.14' -must become \verb"MACOSX_DEPLOYMENT_TARGET=11.3". Otherwise, MATLAB -complains that the \verb'libgraphblas_renamed.dylib' was built for 11.3 but -linked for 10.14. + \begin{packed_itemize} + \item minor updates to documentation and error messages + \item MATLAB interface: minor revision of GrB.deserialize + \end{packed_itemize} -Next, type the following in the MATLAB Command Window: +\item Version 7.1.0 (May 20, 2022) - {\small - \begin{verbatim} - cd GraphBLAS/GraphBLAS/@GrB/private - gbmake \end{verbatim} } + \begin{packed_itemize} + \item added cube root: \verb'GxB_CBRT_FP32' and \verb'GxB_CBRT_FP64' + unary operators + \item added \verb'GxB_Matrix_isStoredElement' + and \verb'GxB_Vector_isStoredElement' + \end{packed_itemize} -Then add the paths to your \verb'startup.m' file (usually in -\verb'~/Documents/MATLAB/startup.m'). For example, my path is: +\item Version 7.0.4 (Apr 25, 2022) - {\small - \begin{verbatim} - addpath ('/Users/davis/GraphBLAS/GraphBLAS') ; - addpath ('/Users/davis/GraphBLAS/GraphBLAS/build') ; \end{verbatim} } + \begin{packed_itemize} + \item (46) bug fix: user-defined type size was incorrectly limited + to 128 bytes. Caught by Erik Welch. + \end{packed_itemize} -Finally, you can run the tests to see if your installation works: +\item Version 7.0.3 (Apr 8, 2022) - {\small - \begin{verbatim} - cd ../../test - gbtest \end{verbatim} } + \begin{packed_itemize} + \item faster transpose when using 2 threads + \end{packed_itemize} -%---------------------------------------- -\subsection{On the ARM64 architecture} -%---------------------------------------- +\item Version 7.0.2 (Apr 5, 2022) -You may encounter a compiler error on the ARM64 architecture when using the -\verb'gcc' compiler, versions 6.x and earlier. This error was encountered on -ARM64 Linux with gcc 6.x: + \begin{packed_itemize} + \item (45) bug fix: vector iterator was broken for iterating across a + vector in bitmap format. Caught by Erik Welch. + \end{packed_itemize} -\begin{verbatim} -`In function GrB_Matrix_apply_BinaryOp1st_Scalar.part.1': -GrB_Matrix_apply.c:(.text+0x210): relocation truncated to -fit: R_AARCH64_CALL26 against `.text.unlikely' -\end{verbatim} +\item Version 7.0.1 (Apr 3, 2022) -For the ARM64, this error is silenced with gcc v7.x and later, at least on -Linux. + \begin{packed_itemize} + \item added revised ACM TOMS submission to the Doc folder + \end{packed_itemize} -%---------------------------------------- -\subsection{On Microsoft Windows} -\label{sec:windows} -%---------------------------------------- +\item Version 7.0.0 (Apr 2, 2022) -SuiteSparse:GraphBLAS is now ported to Microsoft Visual Studio. However, that -compiler is not ANSI C11 compliant. As a result, GraphBLAS on Windows will have -a few minor limitations. + \begin{packed_itemize} + \item (44) spec bug: \verb'GrB_Matrix_diag' + was implemented in v5.2.x and v6.x with the wrong signature. + This fix requires the major release to change, from v6.x to v7.x. + \item (43) performance bug fix for \verb'GrB_mxm': + auto selection for saxpy method (Hash vs Gustavson) revised. 
+ \item \verb'GrB_assign': better performance for \verb'C(i,j)=scalar' and
+ \verb'C(i,j)+=scalar' when \verb'i' and \verb'j' have length 1 (scalar
+ assignment with no scalar expansion).
+ \end{packed_itemize}

-\begin{itemize}
-\item The MS Visual Studio compiler does not support the \verb'_Generic'
-keyword, required for the polymorphic GraphBLAS functions. So for example, you
-will need to use \verb'GrB_Matrix_free' instead of just \verb'GrB_free'.
+\item Version 6.2.5 (Mar 14, 2022)

-\item Variable-length arrays are not supported, so user-defined
-types are limited to 128 bytes in size. This can be changed by editing
-\verb'GB_VLA_MAXSIZE' in \verb'Source/GB_compiler.h', and recompiling
-SuiteSparse:GraphBLAS.
+ \begin{packed_itemize}
+ \item For SuiteSparse v5.11.0.
+ \end{packed_itemize}

-\item AVX acceleration is not enabled.
-\end{itemize}
+\item Version 6.2.4 (Mar 8, 2022)

-If you use a recent \verb'gcc' or \verb'icx' compiler on Windows other than the
-Microsoft Compiler (\verb'cl'), these limitations can be avoided.
+ \begin{packed_itemize}
+ \item (42) bug fix: \verb'GrB_mxm' with 0-by-0 iso full matrices.
+ Caught by Henry Amuasi in the Python
+ grblas interface, then triaged and isolated by Erik Welch.
+ \end{packed_itemize}

-The following instructions apply to Windows 10, CMake 3.16, and
-Visual Studio 2019, but may work for earlier versions.
+\item Version 6.2.3 (Mar 5, 2022)

-\begin{enumerate}
+ \begin{packed_itemize}
+ \item minor update to documentation in \verb'GrB.build':
+ no change to any code
+ \end{packed_itemize}

-\item Install CMake 3.16 or later, if not already installed.
- See \url{https://cmake.org/} for details.
+\item Version 6.2.2 (Feb 28, 2022)

-\item Install Microsoft Visual Studio, if not already installed.
- See \url{https://visualstudio.microsoft.com/} for details.
- Version 2019 is preferred, but earlier versions may also work.
+ \begin{packed_itemize}
+ \item revised output of \verb'GxB_*_sort' to return newly created matrices
+ C and P as full or bitmap matrices, as appropriate, instead of
+ sparse/hypersparse, following their sparsity control settings.
+ \end{packed_itemize}

-\item Open a terminal window and type this in the
- \verb'SuiteSparse/GraphBLAS/build' folder:
+\item Version 6.2.1 (Feb 14, 2022)

- \vspace{-0.1in}
- {\small
- \begin{verbatim}
- cmake .. \end{verbatim} }
- \vspace{-0.1in}
+ \begin{packed_itemize}
+ \item (41) bug fix: \verb'GxB_Iterator_get' used \verb'(void *) + size'
+ arithmetic
+ \end{packed_itemize}

-\item The \verb'cmake' command generates many files in
- \verb'SuiteSparse/GraphBLAS/build', and the file \verb'graphblas.sln' in
- particular. Open the generated \verb'graphblas.sln' file in Visual Studio.
+\item Version 6.2.0 (Feb 14, 2022)

-\item Optionally: right-click \verb'graphblas' in the left panel (Solution
- Explorer) and select properties; then navigate to \verb'Configuration'
- \verb'Properties', \verb'C/C++', \verb'General' and change the parameter
- \verb'Multiprocessor Compilation' to \verb'Yes (/MP)'. Click \verb'OK'.
- This will significantly speed up the compilation of GraphBLAS.
+ \begin{packed_itemize}
+ \item added the \verb'GxB_Iterator' object and its methods. See
+ Section~\ref{iter}.
+ \item \verb'@GrB' interface: revised sparse-times-full rule for the
+ conventional semiring (the syntax \verb'C=A*B'), so that
+ sparse-times-full results in \verb'C' as full,
+ but hypersparse-times-full is not full
+ (typically sparse or hypersparse).
+ \end{packed_itemize}

-\item Select the \verb'Build' menu item at the top of the window and
- select \verb'Build Solution'. This should create a folder called
- \verb'Release' and place the compiled \verb'graphblas.dll',
- \verb'graphblas.lib', and \verb'graphblas.exp' files there. Please be
- patient; some files may take a while to compile and sometimes may appear to
- be stalled. Just wait.
+\item Version 6.1.4 (Jan 12, 2022)

- % Alternatively, type this command in the terminal window:
- % {\small
- % \begin{verbatim}
- % devenv graphblas.sln /build "release|x64" /project graphblas \end{verbatim}}
+ \begin{packed_itemize}
+ \item added Section~\ref{perf} to User Guide: how to get the best
+ performance out of algorithms based on GraphBLAS.
+ \item \verb'cpu_features': no longer built as a separate library,
+ but built directly into \verb'libgraphblas.so' and
+ \verb'libgraphblas.a'. Added compile-time flags to
+ optionally disable the use of \verb'cpu_features' completely.
+ \item Octave 7: port to Apple Silicon (thanks to
+ G{\'{a}}bor Sz{\'{a}}rnyas).
+ \item min/max monoids: real case (FP32 and FP64) no longer terminal
+ \item \verb'@GrB' interface: overloaded \verb'C=A*B' syntax where one
+ matrix is full always results in a full matrix \verb'C'.
+ \item Faster \verb'C=A*B' for sparse-times-full and full-times-sparse
+ for \verb'@GrB' interface.
+ \end{packed_itemize}

-\item Add the \verb'GraphBLAS/build/Release' folder to the Windows System path:
+\item Version 6.1.3 (Jan 1, 2022)

- \begin{itemize}
- \item Open the \verb'Start Menu' and type \verb'Control Panel'.
- \item Select the \verb'Control Panel' app.
- \item When the app opens, select \verb'System and Security'.
- \item Under \verb'System and Security', select \verb'System'.
- \item From the top left side of the \verb'System' window, select
- \verb'Advanced System Settings'. You may have to authenticate
- at this step.
- \item The \verb'Systems Properties' window should appear with the
- \verb'Advanced' tab selected;
- select \verb'Environment Variables'.
- \item The \verb'Environment Variables' window displays 2 sections, one for
- \verb'User' variables and the other for \verb'System' variables. Under
- the \verb'Systems' variable section, scroll to and select \verb'Path',
- then select \verb'Edit'. A editor window appears allowing to add,
- modify, delete or re-order the parts of the \verb'Path'.
- \item Add the full path of the \verb'GraphBLAS\build\Release' folder
- (typically starting with \verb'C:\Users\you\'..., where \verb'you' is
- your Windows username) to the \verb'Path'.
- \item If the above steps do not work, you can instead copy the
- \verb'graphblas.*' files from \verb'GraphBLAS\build\Release' into any
- existing folder listed in your \verb'Path'.
- \end{itemize}
+ \begin{packed_itemize}
+ \item performance: task creation for \verb'GrB_mxm'
+ had a minor flaw (results were correct but parallelism suffered).
+ Performance improvement of up to 10x when \verb'nnz(A)<<nnz(B)'.
+ \end{packed_itemize}

+\item Version 6.1.2 (Dec 31, 2021)

+ \begin{packed_itemize}
+ \item (40) bug fix: \verb'C<M>=Z' not returning \verb'C'
+ as iso if \verb'Z' is iso and \verb'C' initially
+ empty. Caught by Erik Welch, Anaconda.
+ \item performance improvements: \verb'C=A*B': sparse/hyper times
+ bitmap/full, and vice versa, including \verb'C += A*B' when \verb'C' is
+ full.
+ \end{packed_itemize} - {\scriptsize - \begin{verbatim} - sed -i.bkp 's/-fopenmp/-Xclang -fopenmp/g' @GrB/private/gbmake.m - \end{verbatim} } +\item Version 5.1.10 (Oct 27, 2021) + + \begin{packed_itemize} + \item (35) bug fix: \verb'GB_selector'; \verb'A->plen' and \verb'C->plen' + not updated correctly. Caught by Jeffry Lovitz, Redis. + \end{packed_itemize} -\end{itemize} +\item Version 5.1.9 (Oct 26, 2021) -Once Octave 7 and SuiteSparse:GraphBLAS are compiled and installed, -and \verb'gbmake.m' is modified if needed for Octave 7 on the Mac, -(or if using MATLAB) continue with the following instructions: + \begin{packed_itemize} + \item (34) bug fix: in-place test incorrect for \verb"C+=A'*B" using dot4 + \item (33) bug fix: disable free pool if OpenMP not available + \end{packed_itemize} -\begin{enumerate} -\item In the Octave/MATLAB command window: +\item Version 5.1.8 (Oct 5, 2021) - {\small - \begin{verbatim} - cd GraphBLAS/GraphBLAS/@GrB/private - gbmake \end{verbatim} } + \begin{packed_itemize} + \item (32) bug fix: C=A*B when A is sparse and B is iso and bitmap. + Caught by Mark Blanco, CMU. + \end{packed_itemize} -\item Follow the remaining instructions in the - \verb'GraphBLAS/GraphBLAS/README.md' file, to revise your - Octave/MATLAB path and \verb'startup.m' file. +\item Version 5.1.7 (Aug 23, 2021) -\item As a quick test, try the command \verb'GrB(1)', which - creates and displays a 1-by-1 GraphBLAS matrix. For a longer test, do the - following: + \begin{packed_itemize} + \item (31) bug fix: \verb'GrB_apply', when done in-place and matrix starts + non-iso and becomes iso, gave the wrong iso result. + Caught by Fabian Murariu. + \end{packed_itemize} - {\small - \begin{verbatim} - cd GraphBLAS/GraphBLAS/test - gbtest \end{verbatim} } +\item Version 5.1.6 (Aug 16, 2021) -\item In Windows, if the tests fail with an error stating that the - mex file is invalid because the module could not be found, it means - that MATLAB could not find the compiled \verb'graphblas.lib', \verb'*.dll' - or \verb'*.exp' files in the \verb'build/Release' folder. This can happen - if your Windows System path is not set properly, or if Windows is not - recognizing the \verb'GraphBLAS/build/Release' folder (see - Section~\ref{sec:windows}) Or, you might not have permission to change your - Windows System path. In this case, do the following in the MATLAB Command - \vspace{-0.1in} - Window: + \begin{packed_itemize} + \item one-line change to \verb'C=A*B': faster symbolic analysis when a + vector \verb'C(:,j)' is dense (for CSC) or \verb'C(i,:)' for CSR. + \end{packed_itemize} - \vspace{-0.1in} - {\small - \begin{verbatim} - cd GraphBLAS/build/Release - GrB(1) \end{verbatim} } +\item Version 5.1.5 (July 15, 2021) - \vspace{-0.1in} - After this step, the GraphBLAS library will be loaded into MATLAB. You may - need to add the above lines in your \verb'Documents/MATLAB/startup.m' file, - so that they are done each time MATLAB starts. You will also need to do - this after \verb'clear all' or \verb'clear mex', since those MATLAB - commands remove all loaded libraries from MATLAB. + \begin{packed_itemize} + \item submission to ACM Transactions on Mathematical Software as + a Collected Algorithm of the ACM. + \end{packed_itemize} - You might also get an error ``the specified procedure cannot be found.'' - This can occur if you have upgraded your GraphBLAS library from a prior - version, and some of the compiled files \verb'@GrB/private/*.mex*' - are stale. 
Try the command \verb'gbmake all' in the MATLAB Command - Window, which forces all of the MATLAB interface to be recompiled. - Or, try deleting all \verb'@GrB/private/*.mex*' files and running - \verb'gbmake' again. +\item Version 5.1.4 (July 6, 2021) -\item On Windows, the \verb'casin', \verb'casinf', \verb'casinh', and - \verb'casinhf' functions provided by Microsoft do not return the correct - imaginary part. As a result, \verb'GxB_ASIN_FC32', \verb'GxB_ASIN_FC64' - \verb'GxB_ASINH_FC32', and \verb'GxB_ASINH_FC64' do not work properly on - Windows. This affects the \verb'GrB/asin', \verb'GrB/acsc', - \verb'GrB/asinh', and \verb'GrB/acsch', functions in the MATLAB interface. - See the MATLAB tests bypassed in \verb'gbtest76.m' for details, in the - \newline - \verb'GraphBLAS/GraphBLAS/test' folder. - %% FUTURE: fix asin and acsc on Windows for the complex case. + \begin{packed_itemize} + \item faster Octave interface. Octave v7 or later is required. + \item (30) bug fix: 1-based printing not enabled for pending tuples. + Caught by Will Kimmerer, while working on the Julia interface. + \end{packed_itemize} -\end{enumerate} +\item Version 5.1.3 (July 3, 2021) -%---------------------------------------- -\subsection{Compiling the Octave/MATLAB interface (for MATLAB R2021a and later)} -\label{R2021a} -%---------------------------------------- + \begin{packed_itemize} + \item added \verb'GxB_Matrix_iso' and \verb'GxB_Vector_iso': + to query if a matrix or vector is held as iso-valued + \item (29) bug fix: \verb'Matrix_pack_*R' into a matrix previously held by + column, or \verb'Matrix_pack*C' into a matrix by row, would flip the + dimensions. + Caught by Erik Welch, Anaconda. + \item (28) bug fix: \verb'kron(A,B)' with iso input matrices + \verb'A' and \verb'B' fixed. + Caught by Michel Pelletier, Graphegon. + \item (27) bug fix: v5.1.0 had a wrong version of a file; posted by mistake. + Caught by Michel Pelletier, Graphegon. + \end{packed_itemize} -MATLAB R2021a includes its own copy of SuiteSparse:GraphBLAS v3.3.3, as the -file \verb'libmwgraphblas.so', which is used for the built-in \verb'C=A*B' when -both \verb'A' and \verb'B' are sparse (see the Release Notes of MATLAB R2021a, -which discusses the performance gained in MATLAB by using GraphBLAS). +\item Version 5.1.2 (June 30, 2021) -That's great news for the impact of GraphBLAS on MATLAB itself, and the domain -of high performance computing in general, but it causes a linking problem when -using this MATLAB interface for GraphBLAS. The two use different versions of -the same library, and a segfault arises if the MATLAB interface for v4.x (or -later) tries to link with the older GraphBLAS v3.3.3 library. Likewise, the -built-in \verb'C=A*B' causes a segfault if it tries to use the newer GraphBLAS -v4.x (or later) libraries. + \begin{packed_itemize} + \item iso matrices added: these are matrices and vectors whose + values in the sparsity pattern are all the same. This is an internal + change to the opaque data structures of the \verb'GrB_Matrix' and + \verb'GrB_Vector' with very little change to the API. + \item added \verb'GxB_Matrix_build_Scalar' + and \verb'GxB_Vector_build_Scalar', + which always build iso matrices and vectors. + \item import/export methods can now import/export iso matrices and vectors. + \item added \verb'GrB.argmin/argmax' to Octave/MATLAB interface + \item added \verb'GxB_*_pack/unpack' methods as alternatives to + import/export. + \item added \verb'GxB_PRINT_1BASED' to the global settings. 
+ \item added \verb'GxB_*_memoryUsage'
+ \item port to Octave: \verb'gbmake' and \verb'gbtest'
+ work in Octave 7 to build and test
+ the \verb'@GrB' interface to GraphBLAS. Octave 7.0.0 is required.
+ \end{packed_itemize}

-To resolve this issue, a second GraphBLAS library must be compiled,
-\verb'libgraphblas_renamed', where the internal symbols are all renamed so they
-do not conflict with the \verb'libmwgraphblas' library. Then both libraries
-can co-exist in the same instance of MATLAB.
+\item Version 5.0.6 (May 24, 2021)

-To do this, go to the \verb'GraphBLAS/GraphBLAS' folder, containing the
-MATLAB interface. That folder contains a \verb'CMakeLists.txt' file to
-compile the \verb'libgraphblas_renamed' library. See the instructions
-for how to compile the C library \verb'libgraphblas', and repeat them but
-using the folder \newline
-\verb'SuiteSparse/GraphBLAS/GraphBLAS/build' instead of \newline
-\verb'SuiteSparse/GraphBLAS/build'.
+ \begin{packed_itemize}
+ \item BFS and triangle counting demos removed from GraphBLAS/Demo:
+ see LAGraph for these algorithms. Eventually, all of GraphBLAS/Demo
+ will be deleted, once LAGraph includes all the methods included there.
+ \end{packed_itemize}

-This will compile the renamed SuiteSparse:GraphBLAS dynamic library
-(\verb'libgraphblas_renamed.so' for Linux, \verb'libgraphblas_renamed.dylib'
-for Mac, or \verb'graphblas_renamed.dll' for Windows). These can be
-placed in the same system-wide location as the standard \verb'libgraphblas'
-libraries, such as \verb'/usr/local/lib' for Linux. The two pairs of
-libraries share the identical \verb'GraphBLAS.h' include file.
+\item Version 5.0.5 (May 17, 2021)

+ \begin{packed_itemize}
+ \item (26) performance bug fix: reduce-to-vector where \verb'A' is
+ hypersparse CSR with a transposed descriptor (or CSC with no
+ transpose), and some cases for \verb'GrB_mxm/mxv/vxm' when computing
+ \verb'C=A*B' with A hypersparse CSC and \verb'B' bitmap/full (or
+ \verb'A' bitmap/full and \verb'B' hypersparse CSR), the wrong internal
+ method was being selected via the auto-selection strategy, resulting in
+ a significant slowdown in some cases.
+ \end{packed_itemize}

+\item Version 5.0.4 (May 13, 2021)

+ \begin{packed_itemize}
+ \item \verb'@GrB' Octave/MATLAB interface: changed license
+ to GNU General Public License v3.0 or later.
+ \end{packed_itemize}

+\item Version 5.0.3 (May 12, 2021)

- {\small
- \begin{verbatim}
- # check which compiler is being used.
If you need to make - # compiler-specific modifications, here is the place to do it. - if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") - # cmake 2.8 workaround: gcc needs to be told to do ANSI C11. - # cmake 3.0 doesn't have this problem. - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -lm " ) - ... - elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel") - ... - elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") - ... - elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") - ... - endif ( ) - \end{verbatim} } + \begin{packed_itemize} + \item (25) bug fix: disabling \verb'ANY_PAIR' semirings by editing + \verb'Source/GB_control.h' would cause a segfault if those disabled + semirings were used. + \item demos are no longer built by default + \item (24) bug fix: new functions in v5.0.2 not declared as \verb'extern' + in \verb'GraphBLAS.h'. + \item \verb'GrB_Matrix_reduce_BinaryOp' reinstated from v4.0.3; + same limit on built-in ops that correspond to known monoids. + \end{packed_itemize} -To compile SuiteSparse:GraphBLAS without running the demos, use \newline -\verb'make library' in the top-level directory, or \verb'make' in the -\verb'build' directory. +\item Version 5.0.2 (May 5, 2021) -Several compile-time options can be selected by editing the \verb'Source/GB.h' -file, but these are meant only for code development of SuiteSparse:GraphBLAS -itself, not for end-users of SuiteSparse:GraphBLAS. + \begin{packed_itemize} + \item (23) bug fix: \verb'GrB_Matrix_apply_BinaryOp1st' and \verb'2nd' + were using the + wrong descriptors for \verb'GrB_INP0' and \verb'GrB_INP1'. + Caught by Erik Welch, Anaconda. + \item memory pool added for faster allocation/free of small blocks + \item \verb'@GrB' interface ported to MATLAB R2021a. + \item \verb'GxB_PRINTF' and \verb'GxB_FLUSH' global options added. + \item \verb'GxB_Matrix_diag': construct a diagonal matrix from a vector + \item \verb'GxB_Vector_diag': extract a diagonal from a matrix + \item \verb'concat/split': methods to concatenate and split matrices. + \item \verb'import/export': + size of arrays now in bytes, not entries. This change + is required for better internal memory management, and it is not + backward compatible with the \verb'GxB*import/export' functions in v4.0. + A new parameter, \verb'is_uniform', has been added to all import/export + methods, which indicates that the matrix values are all the same. + \item (22) bug fix: SIMD vectorization was missing + \verb'reduction(+,task_cnvals)' in + \verb'GB_dense_subassign_06d_template.c'. Caught by Jeff Huang, Texas + A\&M, with his software package for race-condition detection. + \item \verb'GrB_Matrix_reduce_BinaryOp': removed. Use a monoid instead, + with \verb'GrB_reduce' or \verb'GrB_Matrix_reduce_Monoid'. + \end{packed_itemize} -%---------------------------------------- -\subsection{Using a plain makefile} -\label{altmake} -%---------------------------------------- +\item Version 4.0.3 (Jan 19, 2021) -The \verb'GraphBLAS/alternative' directory contains a simple \verb'Makefile' -that can be used to compile SuiteSparse:GraphBLAS. This is a useful option -if you do not have the required version of \verb'cmake'. This \verb'Makefile' -can even compile the entire library with a C++ compiler, which cannot be -done with \verb'CMake'. 
+ \begin{packed_itemize}
+ \item faster min/max monoids
+ \item \verb'G=GrB(G)' converts \verb'G' from v3 object to v4
+ \end{packed_itemize}

-This alternative \verb'Makefile' does not build the
-\verb'libgraphblas_renamed.so' library required for MATLAB R2021a (see
-Section~\ref{R2021a}). This can be done by revising the \verb'Makefile',
-however: add the \verb'-DGBRENAME=1' flag, and change the library name
-from \verb'libgraphblas' to \verb'libgraphbas_renamed'.

+\item Version 4.0.2 (Jan 13, 2021)

-%----------------------------------------
-\subsection{Running the Demos}
-%----------------------------------------

+ \begin{packed_itemize}
+ \item ability to load \verb'*.mat' files saved with the v3 \verb'GrB'
+ \end{packed_itemize}

-After \verb'make' in the top-level directory to compile the library, type
-\verb'make run' to run the demos (also in the top-level directory).
-You can also run the demos after compiling with \verb'make all':

+\item Version 4.0.1 (Jan 4, 2021)

- {\small
- \begin{verbatim}
- make all
- cd Demo
- ./demo \end{verbatim} }

+ \begin{packed_itemize}
+ \item significant performance improvements: compared with v3.3.3,
+ up to 5x faster in breadth-first-search (using
+ \verb'LAGraph_bfs_parent2'), and 2x faster in
+ betweenness-centrality (using \verb'LAGraph_bc_batch5').
+ \item \verb'GrB_wait(void)', with no inputs: removed
+ \item \verb'GrB_wait(&object)': polymorphic function added
+ \item \verb'GrB_*_nvals': no longer guarantees completion;
+ use \verb'GrB_wait(&object)'
+ or non-polymorphic \verb'GrB_*_wait (&object)' instead
+ \item \verb'GrB_error': now has two parameters: a string
+ (\verb'char **') and an object.
+ \item \verb'GrB_Matrix_reduce_BinaryOp' limited to built-in operators that
+ correspond to known monoids.
+ \item \verb'GrB_*_extractTuples': may return indices out of order
+ \item removed internal features: GBI iterator, slice and hyperslice matrices
+ \item bitmap/full matrices and vectors added
+ \item positional operators and semirings:
+ \verb'GxB_FIRSTI_INT32' and related ops
+ \item jumbled matrices: sort left pending, like zombies and pending tuples
+ \item \verb'GxB_get/set': added \verb'GxB_SPARSITY_*'
+ (hyper, sparse, bitmap, or full) and \verb'GxB_BITMAP_SWITCH'.
+ \item \verb'GxB_HYPER': enum renamed to \verb'GxB_HYPER_SWITCH'
+ \item \verb'GxB*import/export': API modified
+ \item \verb'GxB_SelectOp': \verb'nrows' and \verb'ncols' removed
+ from function signature.
+ \item OpenMP tasking removed from mergesort and replaced with parallel
+ for loops. Just as fast on Linux/Mac; now the performance ports to
+ Windows.
+ \item \verb'GxB_BURBLE' added as a supported feature. This was an
+ undocumented feature of prior versions.
+ \item bug fix: \verb'A({lo,hi})=scalar'; the equivalent
+ \verb'A(lo:hi)=scalar' was OK
+ \end{packed_itemize}

-The \verb'./demo' command is a script that runs the demos with various input
-matrices in the \verb'Demo/Matrix' folder. The output of the demos will be
-compared with expected output files in \verb'Demo/Output'.

-NOTE:
-DO NOT publish benchmarks of these demos, and do not link against the
-demo library in any user application. These codes are sometimes slow,
-and are meant as simple illustrations only, not for performance. The fastest
-methods are in LAGraph, not in SuiteSparse/GraphBLAS/Demo. Benchmark LAGraph
-instead.
Eventually, all GraphBLAS/Demos methods will be removed, and LAGraph
-will serve all uses: for illustration, benchmarking, and production uses.

+\item Version 3.3.3 (July 14, 2020).
+ Bug fix: \verb'w=A*u' with mask non-empty and \verb'u' empty.

-%----------------------------------------
-\subsection{Installing SuiteSparse:GraphBLAS}
-%----------------------------------------

+\item Version 3.3.2 (July 3, 2020). Minor changes to build system.

-To install the library (typically in \verb'/usr/local/lib' and
-\verb'/usr/local/include' for Linux systems), go to the top-level GraphBLAS
-folder and type:

+\item Version 3.3.1 (June 30, 2020). Bug fix to \verb'GrB_assign' and
+ \verb'GxB_subassign' when the assignment is simple (\verb'C=A') but
+ with typecasting.

+\item Version 3.3.0 (June 26, 2020). Compliant with V1.3 of the C API
+ (except that the polymorphic \verb'GrB_wait(&object)' doesn't appear yet;
+ it will appear in V4.0).

- {\small
- \begin{verbatim}
- sudo make install \end{verbatim} }

+ Added complex types (\verb'GxB_FC32' and \verb'GxB_FC64'), many unary
+ operators, binary operators, monoids, and semirings. Added bitwise
+ operators, and their monoids and semirings. Added the predefined monoids
+ and semirings from the v1.3 specification. \verb'@GrB' interface: added complex
+ matrices and operators, and changed behavior of integer operations to more
+ closely match the behavior on built-in integer matrices. The rules for
+ typecasting large floating-point values to integers have changed. The
+ specific object-based \verb'GrB_Matrix_wait', \verb'GrB_Vector_wait', etc.,
+ functions have been added. The no-argument \verb'GrB_wait()' is
+ deprecated. Added \verb'GrB_getVersion', \verb'GrB_Matrix_resize',
+ \verb'GrB_Vector_resize', \verb'GrB_kronecker', \verb'GrB_*_wait', scalar
+ binding with binary operators for \verb'GrB_apply', \newline
+ \verb'GrB_Matrix_removeElement', and \verb'GrB_Vector_removeElement'.

-%----------------------------------------
-\subsection{Linking issues after installation}
-%----------------------------------------

+\item Version 3.2.0 (Feb 20, 2020). Faster \verb'GrB_mxm', \verb'GrB_mxv', and
+ \verb'GrB_vxm', and faster operations on dense matrices/vectors. Removed
+ compile-time user objects (\verb'GxB_*_define'), since these were not
+ compatible with the faster matrix operations. Added the \verb'ANY' and
+ \verb'PAIR' operators. Added the predefined descriptors,
+ \verb'GrB_DESC_*'. Added the structural mask option. Changed default
+ chunk size to 65,536. \verb'@GrB' interface modified: \verb'GrB.init' is
+ now optional.

-My Linux distro (Ubuntu 18.04) includes a copy of \verb'libgraphblas.so.1',
-which is SuiteSparse:GraphBLAS v1.1.2. After installing SuiteSparse:GraphBLAS
-in \verb'/usr/local/lib' (with \verb'sudo make install'), compiling a simple
-stand-alone program links against \verb'libgraphblas.so.1' instead of the
-latest version, while at the same time accessing the latest version of the
-include file as \verb'/usr/local/include/GraphBLAS.h'. This command fails:

+\item Version 3.1.2 (Dec 2019). Changes to allow SuiteSparse:GraphBLAS
+ to be compiled with the Microsoft Visual Studio compiler. This compiler
+ does not support the \verb'_Generic' keyword, so the polymorphic functions
+ are not available. Use the equivalent non-polymorphic functions instead,
+ when compiling GraphBLAS with MS Visual Studio. In addition,
+ variable-length arrays are not supported, so user-defined types are limited
+ to 128 bytes in size. These changes have no effect if you have an ANSI C11
+ compliant compiler.
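+ For example, a sketch of the two calling styles (assuming an existing
+ \verb'GrB_Matrix A' and \verb'GrB_Index' indices \verb'i' and \verb'j'):
+ {\small
+ \begin{verbatim}
+    // C11 compilers: polymorphic name, dispatched via _Generic
+    GrB_Matrix_setElement (A, 3.14, i, j) ;
+    // MS Visual Studio: use the equivalent non-polymorphic name
+    GrB_Matrix_setElement_FP64 (A, 3.14, i, j) ;
+ \end{verbatim} }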
- {\small
- \begin{verbatim}
- gcc prog.c -lgraphblas \end{verbatim} }

+ \verb'@GrB' interface modified: \verb'GrB.init' is now required.

-Revising my \verb'LD_LIBRARY_PATH' to put \verb'/usr/local/lib' first in the
-library directory order didn't help. If you encounter this problem, try one of
-the following options (all four work for me, and link against the proper
-version (\verb'/usr/local/lib/libgraphblas.so.6.1.4' for example):

+\item Version 3.1.0 (Oct 1, 2019). \verb'@GrB' interface added. See the
+ \newline \verb'GraphBLAS/GraphBLAS' folder for details and documentation,
+ and Section~\ref{octave}.

- {\small
- \begin{verbatim}
- gcc prog.c -l:libgraphblas.so.6
- gcc prog.c -l:libgraphblas.so.6.1.4
- gcc prog.c /usr/local/lib/libgraphblas.so
- gcc prog.c -Wl,-v -L/usr/local/lib -lgraphblas \end{verbatim} }

+\item Version 3.0 (July 26, 2019), with OpenMP parallelism.

-This \verb'prog.c' test program is a trivial one, which works in v1.0 and
-later:

+The version number is increased to 3.0, since
+this version is not backward compatible with V2.x. The \verb'GxB_select'
+operation changes; the \verb'Thunk' parameter was formerly a
+\verb'const void *' pointer, and is now a \verb'GxB_Scalar'. A new parameter
+is added to \verb'GxB_SelectOp_new', to define the expected type of
+\verb'Thunk'. A new parameter is added to \verb'GxB_init', to specify whether
+or not the user-provided memory management functions are thread safe.

- {\small
- \begin{verbatim}
- #include
- int main (void)
- {
- GrB_init (GrB_NONBLOCKING) ;
- GrB_finalize ( ) ;
- } \end{verbatim} }

+The remaining changes add new features, and are upward compatible with V2.x.
+The major change is the addition of OpenMP parallelism. This addition has no
+effect on the API, except that round-off errors can differ with the number of
+threads used, for floating-point types. \verb'GxB_set' can optionally define
+the number of threads to use (the default is \verb'omp_get_max_threads'). The
+number of threads can also be defined globally, and/or in the
+\verb'GrB_Descriptor'. The \verb'RDIV' and \verb'RMINUS' operators are added,
+which are defined as $f(x,y)=y/x$ and $f(x,y)=y-x$, respectively. Additional
+options are added to \verb'GxB_get'.

-Compile the program above, then use this command to ensure
-\verb'libgraphblas.so.6' appears:

+\item Version 2.3.3 (May 2019): Collected Algorithm of the ACM.
+No changes from V2.3.2 other than the documentation.

- {\small
- \begin{verbatim}
- ldd a.out \end{verbatim} }

+\item Version 2.3 (Feb 2019) improves the performance of many GraphBLAS
+operations, including an early-exit for monoids. These changes have a
+significant impact on breadth-first-search (a performance bug was also fixed in
+the two BFS \verb'Demo' codes). The matrix and vector import/export functions
+were added (Section~\ref{pack_unpack}), in support of the new LAGraph project
+(\url{https://github.com/GraphBLAS/LAGraph}, see also Section~\ref{lagraph}).
+LAGraph includes a push-pull BFS in GraphBLAS that is faster than two versions
+in the \verb'Demo' folder. \verb'GxB_init' was added to allow the memory
+manager functions (\verb'malloc', etc.) to be specified.

-%----------------------------------------
-\subsection{Running the tests}
-%----------------------------------------

+\item
+Version 2.2 (Nov 2018)
+adds user-defined objects at compile-time, via user \verb'*.m4' files placed in
+\verb'GraphBLAS/User', which use the \verb'GxB_*_define' macros
+(NOTE: feature removed in v3.2).
+The default matrix format is now \verb'GxB_BY_ROW'.
+Also added are the \verb'GxB_*print' methods for printing the contents of each
+GraphBLAS object (Section~\ref{fprint}). PageRank demos have been added to
+the \verb'Demos' folder.

-To run a short test, type \verb'make run' at the top-level \verb'GraphBLAS'
-folder. This will run all the demos in \verb'GraphBLAS/Demos'. MATLAB is not
-required.

+\item
+Version 2.1 (Oct 2018) was
+a major update with support for new matrix formats
+(by row or column, and hypersparse matrices), and colon notation
+(\verb'I=begin:end' or \verb'I=begin:inc:end'). Some graph algorithms are more
+naturally expressed with matrices stored by row, and this version includes the
+new \verb'GxB_BY_ROW' format. The default format in Version 2.1 and
+prior versions is by column.
+New extensions to GraphBLAS in this version include \verb'GxB_get',
+\verb'GxB_set', \verb'GxB_AxB_METHOD', \verb'GxB_RANGE', \verb'GxB_STRIDE',
+and \verb'GxB_BACKWARDS', and their related definitions, described in
+Sections~\ref{descriptor},~\ref{options},~and~\ref{colon}.

-To perform the extensive tests in the \verb'Test' folder, and the statement
-coverage tests in \verb'Tcov', MATLAB R2017A is required. See the
-\verb'README.txt' files in those two folders for instructions on how to run the
-tests. The tests in the \verb'Test' folder have been ported to MATLAB on
-Linux, MacOS, and Windows. The \verb'Tcov' tests do not work on Windows. The
-MATLAB interface test (\verb'gbtest') works on all platforms; see the
-\verb'GraphBLAS/GraphBLAS' folder for more details.

+\item
+Version 2.0 (March 2018) addressed changes in the GraphBLAS C API
+Specification and added \verb'GxB_kron' and \verb'GxB_resize'.

-%----------------------------------------
-\subsection{Cleaning up}
-%----------------------------------------

+\item
+Version 1.1 (Dec 2017) primarily improved the performance.

-To remove all compiled files, type \verb'make' \verb'distclean' in the top-level
-GraphBLAS folder.

+\item
+Version 1.0 was released on Nov 25, 2017.
+\end{itemize}

 %-------------------------------------------------------------------------------
-\section{About NUMA systems}
+\subsection{Regarding historical and deprecated functions and symbols}
 %-------------------------------------------------------------------------------

-I have tested this package extensively on multicore single-socket systems, but
-have not yet optimized it for multi-socket systems with a NUMA architecture.
-That will be done in a future release. If you publish benchmarks
-with this package, please state the SuiteSparse:GraphBLAS version, and a caveat
-if appropriate. If you see significant performance issues when going from a
-single-socket to multi-socket system, I would like to hear from you so I can
-look into it.

+When a \verb'GxB*' function or symbol is added to the C API Specification with
+a \verb'GrB*' name, the new \verb'GrB*' name should be used instead, if
+possible. However, the old \verb'GxB*' name will be kept as long as possible
+for historical reasons. Historical functions and symbols will not always be
+documented here in the SuiteSparse:GraphBLAS User Guide, but they will be kept
+in \verb'GraphBLAS.h' and remain in good working order in the library.
+Historical functions and symbols would only be removed in the very unlikely
+case that they cause a serious conflict with future methods.
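+For example, \verb'GxB_kron' is the historical name for what is now
+\verb'GrB_kronecker'; both take the same arguments and remain in working
+order. A sketch, assuming a binary operator \verb'op' and matrices
+\verb'A', \verb'B', and \verb'C' already created, with error checking
+omitted:
+ {\small
+ \begin{verbatim}
+    GxB_kron      (C, NULL, NULL, op, A, B, NULL) ;   // historical name
+    GrB_kronecker (C, NULL, NULL, op, A, B, NULL) ;   // preferred name
+ \end{verbatim} }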
+
+The only methods that have been fully deprecated and removed are the older
+versions of the \verb'GrB_wait' and \verb'GrB_error' methods, which are
+incompatible with the latest versions.

 % \newpage

 %-------------------------------------------------------------------------------

diff --git a/GraphBLAS/Doc/GraphBLAS_version.tex b/GraphBLAS/Doc/GraphBLAS_version.tex
index c476f7511..444e264bb 100644
--- a/GraphBLAS/Doc/GraphBLAS_version.tex
+++ b/GraphBLAS/Doc/GraphBLAS_version.tex
@@ -1,5 +1,5 @@
 % version of SuiteSparse:GraphBLAS
 \date{VERSION
-7.0.3,
-Apr 8, 2022}
+7.2.0,
+Aug 8, 2022}
diff --git a/GraphBLAS/Doc/toms_parallel_grb2.pdf b/GraphBLAS/Doc/toms_parallel_grb2.pdf
index 9bec76176..06d1e93b8 100644
Binary files a/GraphBLAS/Doc/toms_parallel_grb2.pdf and b/GraphBLAS/Doc/toms_parallel_grb2.pdf differ
diff --git a/GraphBLAS/GraphBLAS/@GrB/GrB.m b/GraphBLAS/GraphBLAS/@GrB/GrB.m
index 0311ddaca..470db756e 100644
--- a/GraphBLAS/GraphBLAS/@GrB/GrB.m
+++ b/GraphBLAS/GraphBLAS/@GrB/GrB.m
@@ -196,6 +196,7 @@
 % C = coth (G) hyperbolic cotangent
 % C = csc (G) cosecant
 % C = csch (G) hyperbolic cosecant
+% C = cbrt (G) cube root
 %
 % C = diag (A, k) diagonal matrices and diagonals
 % DiGraph = digraph (G,...) directed Graph
@@ -834,6 +835,7 @@
 C = coth (G) ;
 C = csc (G) ;
 C = csch (G) ;
+ C = cbrt (G) ;
 C = diag (A, k) ;
 DiGraph = digraph (G, option) ;
@@ -917,7 +919,7 @@
 C = real (G) ;
 C = repmat (G, m, n) ;
- C = reshape (G, arg1, arg2) ;
+ C = reshape (G, m, n, by_col) ;
 C = round (G) ;
 C = sec (G) ;
diff --git a/GraphBLAS/GraphBLAS/@GrB/MATLAB_vs_GrB.m b/GraphBLAS/GraphBLAS/@GrB/MATLAB_vs_GrB.m
index d636530e6..cf5e4b5bc 100644
--- a/GraphBLAS/GraphBLAS/@GrB/MATLAB_vs_GrB.m
+++ b/GraphBLAS/GraphBLAS/@GrB/MATLAB_vs_GrB.m
@@ -44,8 +44,9 @@
 % ------------------------------------------------
 %
 % In Octave/MATLAB, as in A = rand (3) ; X = A (1:6) extracts the
-% first two columns of A as a 6-by-1 vector. This is not yet
-% supported in GraphBLAS, but may be added in the future.
+% first two columns of A as a 6-by-1 vector. Except for C=A(:),
+% this is not yet supported in GraphBLAS, but will be added in
+% the future.
 %
 % ------------------------------------------------
 %% Increasing/decreasing the size of a matrix:
diff --git a/GraphBLAS/GraphBLAS/@GrB/assign.m b/GraphBLAS/GraphBLAS/@GrB/assign.m
index 5e895314d..c0eabe506 100644
--- a/GraphBLAS/GraphBLAS/@GrB/assign.m
+++ b/GraphBLAS/GraphBLAS/@GrB/assign.m
@@ -37,8 +37,9 @@
 %
 % The J argument is identical, except that it is a list of column
 % indices of C. If only one cell array is provided, J = { } is
-% implied, refering to all n columns of C, like C(I,:). 1D
-% indexing of a matrix C, as in C(I) = A, is not yet supported.
+% implied, referring to all n columns of C, like C(I,:).
+% GrB.assign does not support linear indexing of a 2D matrix,
+% as in C(I)=A when C is a 2D matrix.
% % If neither I nor J are provided on input, then this implies both % I = { } and J = { }, or C(:,:), refering to all rows and columns diff --git a/GraphBLAS/GraphBLAS/@GrB/atan2.m b/GraphBLAS/GraphBLAS/@GrB/atan2.m index 28206652f..d1af6dcae 100644 --- a/GraphBLAS/GraphBLAS/@GrB/atan2.m +++ b/GraphBLAS/GraphBLAS/@GrB/atan2.m @@ -19,7 +19,7 @@ btype = gbtype (B) ; if (gb_contains (atype, 'complex') || gb_contains (btype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (~gb_isfloat (atype)) @@ -38,12 +38,12 @@ C = GrB (gbemult ('atan2', A, B)) ; else % A is a scalar, B is a matrix - C = GrB (gbapply2 ('atan2', A, B)) ; + C = GrB (gbapply2 ('atan2', gbfull (A), B)) ; end else if (gb_isscalar (B)) % A is a matrix, B is a scalar - C = GrB (gbapply2 ('atan2', A, B)) ; + C = GrB (gbapply2 ('atan2', A, gbfull (B))) ; else % both A and B are matrices. C is the set union of A and B. C = GrB (gbeunion ('atan2', A, 0, B, 0)) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/bandwidth.m b/GraphBLAS/GraphBLAS/@GrB/bandwidth.m index 3e86be247..0abc28274 100644 --- a/GraphBLAS/GraphBLAS/@GrB/bandwidth.m +++ b/GraphBLAS/GraphBLAS/@GrB/bandwidth.m @@ -9,26 +9,22 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this will be much faster when implemented in a mexFunction. -% It is currently much slower than the built-in bandwidth function. - -% compute the bandwidth -G = G.opaque ; -[lo, hi] = gb_bandwidth (G) ; - -% return the result if (nargin == 1) - arg1 = lo ; - arg2 = hi ; + % compute lo, and compute hi if present in output argument list + [lo, hi] = gbbandwidth (G.opaque, 1, nargout > 1) ; + arg1 = lo ; + arg2 = hi ; else if (nargout > 1) - error ('too many output arguments') ; + error ('GrB:error', 'too many output arguments') ; elseif isequal (uplo, 'lower') + [lo, ~] = gbbandwidth (G.opaque, 1, 0) ; arg1 = lo ; elseif isequal (uplo, 'upper') + [~, hi] = gbbandwidth (G.opaque, 0, 1) ; arg1 = hi ; else - error ('unrecognized option') ; + error ('GrB:error', 'unrecognized option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/bfs.m b/GraphBLAS/GraphBLAS/@GrB/bfs.m index 9079b1053..bc985d9b8 100644 --- a/GraphBLAS/GraphBLAS/@GrB/bfs.m +++ b/GraphBLAS/GraphBLAS/@GrB/bfs.m @@ -69,7 +69,7 @@ [m, n] = size (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end % get the string options @@ -85,7 +85,7 @@ case { 'check' } check = true ; otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end @@ -97,7 +97,7 @@ % determine the method to use, and convert A if necessary if (isequal (kind, 'undirected')) if (check && ~issymmetric (A)) - error ('A must be symmetric') ; + error ('GrB:error', 'A must be symmetric') ; end if (GrB.isbycol (A)) % A is stored by column but undirected, so use q*A' instead of q*A diff --git a/GraphBLAS/GraphBLAS/@GrB/bitcmp.m b/GraphBLAS/GraphBLAS/@GrB/bitcmp.m index 5a622931d..f204e20de 100644 --- a/GraphBLAS/GraphBLAS/@GrB/bitcmp.m +++ b/GraphBLAS/GraphBLAS/@GrB/bitcmp.m @@ -37,15 +37,15 @@ atype = gbtype (A) ; if (gb_contains (atype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (isequal (atype, 'logical')) - error ('inputs must not be logical') ; + error ('GrB:error', 'inputs must not be logical') ; end if (~gb_contains (assumedtype, 'int')) - error ('assumedtype must be an integer type') ; + error ('GrB:error', 'assumedtype must 
be an integer type') ; end % C will have the same type as A on input diff --git a/GraphBLAS/GraphBLAS/@GrB/bitget.m b/GraphBLAS/GraphBLAS/@GrB/bitget.m index bc59cd40f..ae4be6721 100644 --- a/GraphBLAS/GraphBLAS/@GrB/bitget.m +++ b/GraphBLAS/GraphBLAS/@GrB/bitget.m @@ -54,15 +54,15 @@ btype = gbtype (B) ; if (gb_contains (atype, 'complex') || gb_contains (btype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (isequal (atype, 'logical') || isequal (btype, 'logical')) - error ('inputs must not be logical') ; + error ('GrB:error', 'inputs must not be logical') ; end if (~gb_contains (assumedtype, 'int')) - error ('assumedtype must be an integer type') ; + error ('GrB:error', 'assumedtype must be an integer type') ; end % C will have the same type as A on input diff --git a/GraphBLAS/GraphBLAS/@GrB/bitset.m b/GraphBLAS/GraphBLAS/@GrB/bitset.m index 89641be88..edc955c1e 100644 --- a/GraphBLAS/GraphBLAS/@GrB/bitset.m +++ b/GraphBLAS/GraphBLAS/@GrB/bitset.m @@ -56,11 +56,11 @@ [bm, bn, btype] = gbsize (B) ; if (gb_contains (atype, 'complex') || gb_contains (btype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (isequal (atype, 'logical') || isequal (btype, 'logical')) - error ('inputs must not be logical') ; + error ('GrB:error', 'inputs must not be logical') ; end a_is_scalar = (am == 1) && (an == 1) ; @@ -84,7 +84,7 @@ end if (~gb_contains (assumedtype, 'int')) - error ('assumedtype must be an integer type') ; + error ('GrB:error', 'assumedtype must be an integer type') ; end % C will have the same type as A on input @@ -126,13 +126,13 @@ C = gbeunion (op, A, 0, B, 0) ; else % A is a scalar, B is a matrix - C = gbapply2 (op, A, B) ; + C = gbapply2 (op, gbfull (A), B) ; end else % A is a matrix if (b_is_scalar) % A is a matrix, B is scalar - C = gbapply2 (op, A, B) ; + C = gbapply2 (op, A, gbfull (B)) ; else % both A and B are matrices C = gbeunion (op, A, 0, B, 0) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/cbrt.m b/GraphBLAS/GraphBLAS/@GrB/cbrt.m new file mode 100644 index 000000000..ebdea5825 --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/cbrt.m @@ -0,0 +1,21 @@ +function C = cbrt (G) +%CBRT cube root +% C = cbrt (G) is the cube root of the entries of G. +% +% See also GrB/sqrt, nthroot. + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
+% SPDX-License-Identifier: GPL-3.0-or-later + +G = G.opaque ; +type = gbtype (G) ; +if (gb_contains (type, 'complex')) + error ('GrB:error', 'input must be real') ; +elseif (gb_isfloat (type)) + op = 'cbrt' ; +else + op = 'cbrt.double' ; +end + +C = GrB (gbapply (op, G)) ; + diff --git a/GraphBLAS/GraphBLAS/@GrB/cell2mat.m b/GraphBLAS/GraphBLAS/@GrB/cell2mat.m index 1847157f6..a50ea83b1 100644 --- a/GraphBLAS/GraphBLAS/@GrB/cell2mat.m +++ b/GraphBLAS/GraphBLAS/@GrB/cell2mat.m @@ -59,10 +59,10 @@ % SPDX-License-Identifier: GPL-3.0-or-later if (~iscell (A)) - error ('input must be a cell array') ; + error ('GrB:error', 'input must be a cell array') ; end if (ndims (A) > 2) %#ok - error ('only 2D cell arrays are supported') ; + error ('GrB:error', 'only 2D cell arrays are supported') ; end % get the input matrices diff --git a/GraphBLAS/GraphBLAS/@GrB/complex.m b/GraphBLAS/GraphBLAS/@GrB/complex.m index 39538502d..c98ed1cfe 100644 --- a/GraphBLAS/GraphBLAS/@GrB/complex.m +++ b/GraphBLAS/GraphBLAS/@GrB/complex.m @@ -54,7 +54,7 @@ b_is_scalar = (bm == 1) && (bn == 1) ; if (gb_contains (atype, 'complex') || gb_contains (btype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (a_is_scalar) @@ -94,7 +94,7 @@ else % both A and B are matrices. C is sparse or full. desc.kind = 'builtin' ; - C = gbeadd (A, '+', gbapply2 (1i, '*', B), desc) ; + C = gbeadd (A, '+', gbapply2 (B, '*', 1i), desc) ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/digraph.m b/GraphBLAS/GraphBLAS/@GrB/digraph.m index 4eae995dc..50777c2ca 100644 --- a/GraphBLAS/GraphBLAS/@GrB/digraph.m +++ b/GraphBLAS/GraphBLAS/@GrB/digraph.m @@ -31,7 +31,7 @@ [m, n, type] = gbsize (G) ; if (m ~= n) - error ('G must be square') ; + error ('GrB:error', 'G must be square') ; end % get the string options @@ -40,7 +40,7 @@ if (isequal (lower (option), 'omitselfloops')) omitself = true ; else - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/empty.m b/GraphBLAS/GraphBLAS/@GrB/empty.m index f9777bf07..73a621328 100644 --- a/GraphBLAS/GraphBLAS/@GrB/empty.m +++ b/GraphBLAS/GraphBLAS/@GrB/empty.m @@ -21,7 +21,7 @@ m = max (m, 0) ; n = max (n, 0) ; if (~ ((m == 0) || (n == 0))) - error ('at least one dimension must be zero') ; + error ('GrB:error', 'at least one dimension must be zero') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/end.m b/GraphBLAS/GraphBLAS/@GrB/end.m index f7f701dd0..30686d266 100644 --- a/GraphBLAS/GraphBLAS/@GrB/end.m +++ b/GraphBLAS/GraphBLAS/@GrB/end.m @@ -6,18 +6,19 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: add linear indexing -% FUTURE: use hypersparse matrices to implement multidimensionl nD arrays - if (ndims == 1) - if (~isvector (G)) - error ('Linear indexing not yet supported') ; + if (isvector (G)) + % G(end) of a vector G + i = length (G) ; + else + % G(end) of a matrix G, for linear indexing + i = numel (G) ; end - i = length (G) ; elseif (ndims == 2) s = size (G) ; i = s (k) ; else - error ('%dD indexing not yet supported', ndims) ; + % sparse N-dimensional arrays for N > 2 will not be supported + error ('GrB:error', '%dD indexing not supported', ndims) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/eps.m b/GraphBLAS/GraphBLAS/@GrB/eps.m index ce7e7bb3d..ffb8c9a4b 100644 --- a/GraphBLAS/GraphBLAS/@GrB/eps.m +++ b/GraphBLAS/GraphBLAS/@GrB/eps.m @@ -8,13 +8,12 @@ % SuiteSparse:GraphBLAS, Timothy A. 
Davis, (c) 2017-2022, All Rights Reserved.
 % SPDX-License-Identifier: GPL-3.0-or-later

-% FUTURE: GraphBLAS should have a built-in unary operator to
-% compute eps.
+% FUTURE: GraphBLAS should have a built-in eps unary operator.

 % convert to a built-in full matrix and use the built-in eps:

-% FUTURE: there should be a sparse version of 'eps'. C is full because
-% eps (0) is 2^(-1024).
+% FUTURE: there should be a sparse version of 'eps'.
+% C is full because eps (0) is 2^(-1024).

 switch (GrB.type (G))
@@ -31,7 +30,7 @@
 C = max (eps (double (real (G))), eps (double (imag (G)))) ;
 otherwise
- error ('input must be floating-point') ;
+ error ('GrB:error', 'input must be floating-point') ;
 end
diff --git a/GraphBLAS/GraphBLAS/@GrB/erf.m b/GraphBLAS/GraphBLAS/@GrB/erf.m
index dadba6f1e..82fc0e051 100644
--- a/GraphBLAS/GraphBLAS/@GrB/erf.m
+++ b/GraphBLAS/GraphBLAS/@GrB/erf.m
@@ -11,7 +11,7 @@
 G = G.opaque ;
 type = gbtype (G) ;
 if (gb_contains (type, 'complex'))
- error ('input must be real') ;
+ error ('GrB:error', 'input must be real') ;
 end
 if (~gb_isfloat (type))
 op = 'erf.double' ;
diff --git a/GraphBLAS/GraphBLAS/@GrB/erfc.m b/GraphBLAS/GraphBLAS/@GrB/erfc.m
index c96c775b9..91694a684 100644
--- a/GraphBLAS/GraphBLAS/@GrB/erfc.m
+++ b/GraphBLAS/GraphBLAS/@GrB/erfc.m
@@ -11,7 +11,7 @@
 G = G.opaque ;
 type = gbtype (G) ;
 if (gb_contains (type, 'complex'))
- error ('input must be real') ;
+ error ('GrB:error', 'input must be real') ;
 end
 if (~gb_isfloat (type))
 type = 'double' ;
diff --git a/GraphBLAS/GraphBLAS/@GrB/expand.m b/GraphBLAS/GraphBLAS/@GrB/expand.m
index 87bae9855..3ab905c3e 100644
--- a/GraphBLAS/GraphBLAS/@GrB/expand.m
+++ b/GraphBLAS/GraphBLAS/@GrB/expand.m
@@ -23,7 +23,7 @@
 end
 if (~gb_isscalar (scalar))
- error ('first input must be a scalar') ;
+ error ('GrB:error', 'first input must be a scalar') ;
 end
 if (isobject (S))
diff --git a/GraphBLAS/GraphBLAS/@GrB/extract.m b/GraphBLAS/GraphBLAS/@GrB/extract.m
index 91abf0bc7..572504e96 100644
--- a/GraphBLAS/GraphBLAS/@GrB/extract.m
+++ b/GraphBLAS/GraphBLAS/@GrB/extract.m
@@ -29,8 +29,9 @@
 %
 % The J argument is identical, except that it is a list of column
 % indices of A. If only one cell array is provided, J = { } is
-% implied, refering to all n columns of A, like A(I,:). 1D
-% indexing of a matrix A, as in C = A(I), is not yet supported.
+% implied, referring to all n columns of A, like A(I,:).
+% GrB.extract does not support linear indexing of a 2D matrix,
+% as in C=A(I) when A is a 2D matrix.
 %
 % If neither I nor J are provided on input, then this implies both
 % I = { } and J = { }, or A(:,:) refering to all rows and columns
diff --git a/GraphBLAS/GraphBLAS/@GrB/find.m b/GraphBLAS/GraphBLAS/@GrB/find.m
index eb71f5c2d..8fbe8ce0c 100644
--- a/GraphBLAS/GraphBLAS/@GrB/find.m
+++ b/GraphBLAS/GraphBLAS/@GrB/find.m
@@ -24,10 +24,8 @@
 % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
 % SPDX-License-Identifier: GPL-3.0-or-later

-% FUTURE: add linear indexing
-
-% FUTURE: find (G,k,'first') and find (G,k,'last') are slow, since as
-% they are currently implemented, all entries are extracted and then the
+% FUTURE: find (G,k,'first') and find (G,k,'last') are slow: as they
+% are currently implemented, all entries are extracted and then the
 % first or last k are selected from the extracted tuples.
It would be % faster to use a mexFunction that directly accesses the opaque content % of G, instead of using GrB_Matrix_extractTuples_*, which always extracts @@ -40,7 +38,7 @@ if (nargin > 1) k = ceil (double (gb_get_scalar (k))) ; if (k < 1) - error ('k must be positive') ; + error ('GrB:error', 'k must be positive') ; end if (~isequal (gbformat (G), 'by col')) % find (G, k) assumes the matrix is stored by column, so reformat G @@ -76,10 +74,10 @@ % extract indices from a column vector I = gbextracttuples (G) ; else - % FUTURE: this does not return the same thing as I = find (G) - % for the built-in find (..). (need to add 1D linear indexing) - error ('Linear indexing not yet supported') ; - % I = gbextracttuples (G) ; + % extract linear indices from a matrix + [I, J] = gbextracttuples (G) ; + % use the built-in sub2ind to convert the 2D indices to linear indices + I = sub2ind ([m n], I, J) ; end end @@ -111,7 +109,8 @@ X = X (n-k+1:n) ; end else - error ('invalid search option; must be ''first'' or ''last''') ; + error ('GrB:error', ... + 'invalid search option; must be ''first'' or ''last''') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/flip.m b/GraphBLAS/GraphBLAS/@GrB/flip.m index d7a24ea2b..1971c0b9f 100644 --- a/GraphBLAS/GraphBLAS/@GrB/flip.m +++ b/GraphBLAS/GraphBLAS/@GrB/flip.m @@ -33,7 +33,7 @@ dim = floor (double (dim)) ; if (dim <= 0) - error ('dim must be positive') ; + error ('GrB:error', 'dim must be positive') ; end if (dim == 1 && m ~= 1) diff --git a/GraphBLAS/GraphBLAS/@GrB/format.m b/GraphBLAS/GraphBLAS/@GrB/format.m index 9b0da3284..2575529de 100644 --- a/GraphBLAS/GraphBLAS/@GrB/format.m +++ b/GraphBLAS/GraphBLAS/@GrB/format.m @@ -122,7 +122,7 @@ if (nargin == 0) % f = GrB.format ; get the global format if (nargout > 1) - error ('usage: f = GrB.format') ; + error ('GrB:error', 'usage: f = GrB.format') ; end f = gbformat ; else diff --git a/GraphBLAS/GraphBLAS/@GrB/gamma.m b/GraphBLAS/GraphBLAS/@GrB/gamma.m index 04d2c6501..dad78e046 100644 --- a/GraphBLAS/GraphBLAS/@GrB/gamma.m +++ b/GraphBLAS/GraphBLAS/@GrB/gamma.m @@ -11,7 +11,7 @@ G = G.opaque ; type = gbtype (G) ; if (gb_contains (type, 'complex')) - error ('input must be real') ; + error ('GrB:error', 'input must be real') ; end if (~gb_isfloat (type)) type = 'double' ; diff --git a/GraphBLAS/GraphBLAS/@GrB/gammaln.m b/GraphBLAS/GraphBLAS/@GrB/gammaln.m index 823e5404e..c64a1a36c 100644 --- a/GraphBLAS/GraphBLAS/@GrB/gammaln.m +++ b/GraphBLAS/GraphBLAS/@GrB/gammaln.m @@ -12,7 +12,7 @@ G = G.opaque ; type = gbtype (G) ; if (gb_contains (type, 'complex')) - error ('input must be real') ; + error ('GrB:error', 'input must be real') ; end if (~gb_isfloat (type)) type = 'double' ; diff --git a/GraphBLAS/GraphBLAS/@GrB/graph.m b/GraphBLAS/GraphBLAS/@GrB/graph.m index 534141e67..ea3f5852e 100644 --- a/GraphBLAS/GraphBLAS/@GrB/graph.m +++ b/GraphBLAS/GraphBLAS/@GrB/graph.m @@ -30,7 +30,7 @@ [m, n, type] = gbsize (G) ; if (m ~= n) - error ('G must be square') ; + error ('GrB:error', 'G must be square') ; end % get the string options @@ -44,7 +44,7 @@ case { 'omitselfloops' } omitself = true ; otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/incidence.m b/GraphBLAS/GraphBLAS/@GrB/incidence.m index 7668f90b4..f7f7c9076 100644 --- a/GraphBLAS/GraphBLAS/@GrB/incidence.m +++ b/GraphBLAS/GraphBLAS/@GrB/incidence.m @@ -45,7 +45,7 @@ [m, n] = gbsize (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end % 
get the string options @@ -60,9 +60,9 @@ case { 'double', 'single', 'int8', 'int16', 'int32', 'int64' } type = arg ; case { 'uint8', 'uint16', 'uint32', 'uint64', 'logical' } - error ('type must be signed') ; + error ('GrB:error', 'type must be signed') ; otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/isbanded.m b/GraphBLAS/GraphBLAS/@GrB/isbanded.m index d11e21810..290b0990e 100644 --- a/GraphBLAS/GraphBLAS/@GrB/isbanded.m +++ b/GraphBLAS/GraphBLAS/@GrB/isbanded.m @@ -7,8 +7,6 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this will be much faster when 'gb_bandwidth' is a mexFunction. - if (isobject (A)) A = A.opaque ; end @@ -16,6 +14,6 @@ lo = gb_get_scalar (lo) ; hi = gb_get_scalar (hi) ; -[alo, ahi] = gb_bandwidth (A) ; +[alo, ahi] = gbbandwidth (A, 1, 1) ; s = (alo <= lo) & (ahi <= hi) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/isdiag.m b/GraphBLAS/GraphBLAS/@GrB/isdiag.m index 6f0967082..f36c28aa3 100644 --- a/GraphBLAS/GraphBLAS/@GrB/isdiag.m +++ b/GraphBLAS/GraphBLAS/@GrB/isdiag.m @@ -7,10 +7,6 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this will be much faster when written as a mexFunction -% that doesn't rely on gbselect. Use a gb_bandwith mexFunction. - -G = G.opaque ; - -s = (gbnvals (gbselect ('diag', G, 0)) == gbnvals (G)) ; +[lo,hi] = gbbandwidth (G.opaque, 1, 1) ; +s = (lo == 0) && (hi == 0) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/istril.m b/GraphBLAS/GraphBLAS/@GrB/istril.m index 40461c7ee..b5dc14534 100644 --- a/GraphBLAS/GraphBLAS/@GrB/istril.m +++ b/GraphBLAS/GraphBLAS/@GrB/istril.m @@ -10,9 +10,6 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this will be much faster when written as a mexFunction -% that doesn't rely on gbselect. Use a gb_bandwith mexFunction. - -G = G.opaque ; -s = (gbnvals (gbselect ('triu', G, 1)) == 0) ; +[~,hi] = gbbandwidth (G.opaque, 0, 1) ; +s = (hi == 0) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/istriu.m b/GraphBLAS/GraphBLAS/@GrB/istriu.m index a66ceb335..220e4e147 100644 --- a/GraphBLAS/GraphBLAS/@GrB/istriu.m +++ b/GraphBLAS/GraphBLAS/@GrB/istriu.m @@ -10,9 +10,6 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this will be much faster when written as a mexFunction -% that doesn't rely on gbselect. Use a gb_bandwith mexFunction. - -G = G.opaque ; -s = (gbnvals (gbselect ('tril', G, -1)) == 0) ; +[lo,~] = gbbandwidth (G.opaque, 1, 0) ; +s = (lo == 0) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/ktruss.m b/GraphBLAS/GraphBLAS/@GrB/ktruss.m index 7a2b97a6f..1093bf2d9 100644 --- a/GraphBLAS/GraphBLAS/@GrB/ktruss.m +++ b/GraphBLAS/GraphBLAS/@GrB/ktruss.m @@ -45,7 +45,7 @@ k = 3 ; end if (k < 3) - error ('k-truss defined only for k >= 3') ; + error ('GrB:error', 'k-truss defined only for k >= 3') ; end if (nargin < 3) @@ -56,7 +56,7 @@ [m, n] = size (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end int_type = 'int64' ; @@ -69,10 +69,10 @@ if (check) % Do the costly checks. These are optional. 
if (~issymmetric (C)) - error ('A must have a symmetric pattern') ; + error ('GrB:error', 'A must have a symmetric pattern') ; end if (nnz (diag (C) > 0)) - error ('A must have a zero-free diagonal') ; + error ('GrB:error', 'A must have a zero-free diagonal') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/laplacian.m b/GraphBLAS/GraphBLAS/@GrB/laplacian.m index c52c01793..d6adcfacf 100644 --- a/GraphBLAS/GraphBLAS/@GrB/laplacian.m +++ b/GraphBLAS/GraphBLAS/@GrB/laplacian.m @@ -35,7 +35,7 @@ [m, n] = gbsize (A) ; if (m ~= n) - error ('A must be square and symmetric') ; + error ('GrB:error', 'A must be square and symmetric') ; end % get the type @@ -43,7 +43,7 @@ type = 'double' ; elseif (~gb_issigned (type)) % type must be signed - error ('type cannot be logical or unsigned integer') ; + error ('GrB:error', 'type cannot be logical or unsigned integer') ; end % S = spones (A) @@ -53,7 +53,7 @@ if (nargin > 2 && isequal (check, 'check')) % make sure spones (S) is symmetric if (~gb_issymmetric (S, 'nonskew', false)) - error ('spones(A) must be symmetric') ; + error ('GrB:error', 'spones(A) must be symmetric') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/max.m b/GraphBLAS/GraphBLAS/@GrB/max.m index 182267ef6..67f56c429 100644 --- a/GraphBLAS/GraphBLAS/@GrB/max.m +++ b/GraphBLAS/GraphBLAS/@GrB/max.m @@ -27,7 +27,7 @@ type = gbtype (A) ; if (gb_contains (type, 'complex')) - error ('complex matrices not yet supported') ; + error ('GrB:error', 'complex matrices not yet supported') ; elseif (isequal (type, 'logical')) op = '|.logical' ; else @@ -46,7 +46,8 @@ else % C = max (A, [ ], option) if (~isempty (B)) - error ('dimension argument not allowed with 2 input matrices') ; + error ('GrB:error', ... + 'dimension argument not allowed with 2 input matrices') ; end C = GrB (gb_max3 (op, A, option)) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/min.m b/GraphBLAS/GraphBLAS/@GrB/min.m index 9b9099853..e78d94183 100644 --- a/GraphBLAS/GraphBLAS/@GrB/min.m +++ b/GraphBLAS/GraphBLAS/@GrB/min.m @@ -27,7 +27,7 @@ type = gbtype (A) ; if (gb_contains (type, 'complex')) - error ('complex matrices not yet supported') ; + error ('GrB:error', 'complex matrices not yet supported') ; elseif (isequal (type, 'logical')) op = '&.logical' ; else @@ -46,7 +46,8 @@ else % C = min (A, [ ], option) if (~isempty (B)) - error ('dimension argument not allowed with 2 input matrices') ; + error ('GrB:error', ... 
+ 'dimension argument not allowed with 2 input matrices') ; end C = GrB (gb_min3 (op, A, option)) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/mis.m b/GraphBLAS/GraphBLAS/@GrB/mis.m index 6f8b157e8..eef1752bb 100644 --- a/GraphBLAS/GraphBLAS/@GrB/mis.m +++ b/GraphBLAS/GraphBLAS/@GrB/mis.m @@ -29,7 +29,7 @@ [m, n] = size (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end % convert A to logical @@ -41,16 +41,16 @@ if (isequal (check, 'check')) check = true ; else - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end if (check) if (nnz (diag (A)) > 0) - error ('A must not have any diagonal entries') ; + error ('GrB:error', 'A must not have any diagonal entries') ; end if (~issymmetric (A)) - error ('A must be symmetric') ; + error ('GrB:error', 'A must be symmetric') ; end end @@ -124,7 +124,7 @@ % this will not occur, unless the input is corrupted somehow if (last_ncand == ncand) - error ('method stalled; rerun with ''check'' option') ; + error ('GrB:error', 'method stalled; rerun with ''check'' option') ; end last_ncand = ncand ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/mpower.m b/GraphBLAS/GraphBLAS/@GrB/mpower.m index ccf6165e6..ef4e8d766 100644 --- a/GraphBLAS/GraphBLAS/@GrB/mpower.m +++ b/GraphBLAS/GraphBLAS/@GrB/mpower.m @@ -25,14 +25,16 @@ C = GrB (gb_power (A, B)) ; else if (am ~= an) - error ('For C=A^B, A must be square') ; + error ('GrB:error', 'For C=A^B, A must be square') ; end if (~b_is_scalar) - error ('For C=A^B, B must be a non-negative integer scalar') ; + error ('GrB:error', ... + 'For C=A^B, B must be a non-negative integer scalar') ; end b = gb_scalar (B) ; if (~(isreal (b) && isfinite (b) && round (b) == b && b >= 0)) - error ('For C=A^B, B must be a non-negative integer scalar') ; + error ('GrB:error', ... + 'For C=A^B, B must be a non-negative integer scalar') ; end if (b == 0) % C = A^0 = I diff --git a/GraphBLAS/GraphBLAS/@GrB/num2cell.m b/GraphBLAS/GraphBLAS/@GrB/num2cell.m index 558c90907..442bf00c8 100644 --- a/GraphBLAS/GraphBLAS/@GrB/num2cell.m +++ b/GraphBLAS/GraphBLAS/@GrB/num2cell.m @@ -46,7 +46,7 @@ % split A into rows C = gbsplit (A, ones (m, 1), n) ; else - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end % convert each cell back into GrB matrices diff --git a/GraphBLAS/GraphBLAS/@GrB/pagerank.m b/GraphBLAS/GraphBLAS/@GrB/pagerank.m index 0f739a191..50d3808cd 100644 --- a/GraphBLAS/GraphBLAS/@GrB/pagerank.m +++ b/GraphBLAS/GraphBLAS/@GrB/pagerank.m @@ -54,7 +54,7 @@ end if (~(isequal (opts.type, 'single') || isequal (opts.type, 'double'))) - error ('opts.type must be ''single'' or ''double''') ; + error ('GrB:error', 'opts.type must be ''single'' or ''double''') ; end % get options @@ -68,7 +68,7 @@ [m, n] = size (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end % select the semiring and determine if A is native diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_2d_to_1d.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_2d_to_1d.m index afe61e120..0a05a551c 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_2d_to_1d.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_2d_to_1d.m @@ -1,8 +1,17 @@ -function k = gb_2d_to_1d (i, j, m) +function [k, mn] = gb_2d_to_1d (i, j, m, n) %GB_2D_TO_1D convert 2D indices to 1D; the indices must be zero-based. % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: GPL-3.0-or-later +% check for overflow +if (double (m) * double (n) > 2^60) + error ('GrB:error', 'problem too large') ; +end + +% mn = the length of the vector x=A(:), if A is m by n +mn = int64 (m) * int64 (n) ; + +% convert the 2D indices (i,j) into 1D indices (k) k = i + j * m ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_bandwidth.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_bandwidth.m deleted file mode 100644 index c19e7fc60..000000000 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_bandwidth.m +++ /dev/null @@ -1,23 +0,0 @@ -function [lo, hi] = gb_bandwidth (G) -%GB_BANDWIDTH Determine the bandwidth of a GraphBLAS matrix. -% Implements [lo, hi] = bandwidth (G). - -% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. -% SPDX-License-Identifier: GPL-3.0-or-later - -% FUTURE: this is slow; use gbselect with DIAGINDEX and then -% find the min and max entries - -% compute the bandwidth -if (gbnvals (G) == 0) - % matrix is empty - hi = 0 ; - lo = 0 ; -else - desc.base = 'zero-based' ; - [i, j] = gbextracttuples (G, desc) ; - b = j - i ; - hi = max (0, double (max (b))) ; - lo = max (0, -double (min (b))) ; -end - diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_bitwise.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_bitwise.m index 2c5aebf9c..c9e821681 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_bitwise.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_bitwise.m @@ -16,15 +16,15 @@ btype = gbtype (B) ; if (gb_contains (atype, 'complex') || gb_contains (btype, 'complex')) - error ('inputs must be real') ; + error ('GrB:error', 'inputs must be real') ; end if (isequal (atype, 'logical') || isequal (btype, 'logical')) - error ('inputs must not be logical') ; + error ('GrB:error', 'inputs must not be logical') ; end if (~gb_contains (assumedtype, 'int')) - error ('assumedtype must be an integer type') ; + error ('GrB:error', 'assumedtype must be an integer type') ; end % C will have the same type as A on input @@ -48,11 +48,17 @@ B = gbnew (B, 'int8') ; end - if (gb_isscalar (A) || gb_isscalar (B)) - % either A or B are scalars - C = gbapply2 (['bitshift.' atype], A, B) ; + a_is_scalar = gb_isscalar (A) ; + b_is_scalar = gb_isscalar (B) ; + + if (a_is_scalar && ~b_is_scalar) + % A is a scalar, B is a matrix + C = gbapply2 (['bitshift.' atype], gbfull (A), B) ; + elseif (~a_is_scalar && b_is_scalar) + % A is a matrix, B is a scalar + C = gbapply2 (['bitshift.' atype], A, gbfull (B)) ; else - % both A and B are matrices. + % both A and B are matrices, or both are scalars % expand B by padding it with zeros from the pattern of A B = gbeadd ('1st.int8', B, gb_expand (0, A, 'int8')) ; C = gbemult (['bitshift.' atype], A, B) ; @@ -65,7 +71,7 @@ btype = assumedtype ; end if (~isequal (atype, btype)) - error ('integer inputs must have the same type') ; + error ('GrB:error', 'integer inputs must have the same type') ; end switch (op) diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_emult.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_emult.m index ee1e0ed7d..c05e79732 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_emult.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_emult.m @@ -12,11 +12,21 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: GPL-3.0-or-later -if (gb_isscalar (A) || gb_isscalar (B)) - % either A or B are scalars - C = gbapply2 (A, op, B) ; +if (gb_isscalar (A)) + if (gb_isscalar (B)) + % both A and B are scalars + C = gbemult (A, op, B) ; + else + % A is a scalar, B is a matrix + C = gbapply2 (gbfull (A), op, B) ; + end else - % both A and B are matrices - C = gbemult (A, op, B) ; + if (gb_isscalar (B)) + % A is a matrix, B is a scalar + C = gbapply2 (A, op, gbfull (B)) ; + else + % both A and B are matrices + C = gbemult (A, op, B) ; + end end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_entries.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_entries.m index df645a6a4..3c9a43d42 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_entries.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_entries.m @@ -16,7 +16,7 @@ case { 'count', 'list', 'degree' } kind = arg ; otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end @@ -32,7 +32,7 @@ % X = GrB.entries (A, 'list') result = unique (gbextractvalues (A)) ; otherwise - error ('''all'' and ''degree'' cannot be combined') ; + error ('GrB:error', '''all'' and ''degree'' cannot be combined') ; end else diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_eunion.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_eunion.m index cc4de0106..98f671a10 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_eunion.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_eunion.m @@ -23,27 +23,17 @@ % both A and B are scalars. Result is also a scalar. C = gbeadd (A, op, B) ; else - % A is a scalar, B is a matrix. Result is full, unless A == 0. - if (gb_scalar (A) == 0) - % C = 0+B is a built-in matrix if B is a built-in matrix - C = B ; - else - % expand A to a full matrix - A = gb_scalar_to_full (bm, bn, type, gb_fmt (B), A) ; - C = gbeadd (A, op, B) ; - end + % A is a scalar, B is a matrix. Result is full. + % expand A to a full matrix + A = gb_scalar_to_full (bm, bn, type, gb_fmt (B), A) ; + C = gbeadd (A, op, B) ; end else if (b_is_scalar) - % A is a matrix, B is a scalar. Result is full, unless B == 0. - if (gb_scalar (B) == 0) - % C = A+0 is a built-in matrix if A is a built-in matrix - C = A ; - else - % expand B to a full matrix - B = gb_scalar_to_full (am, an, type, gb_fmt (A), B) ; - C = gbeadd (A, op, B) ; - end + % A is a matrix, B is a scalar. Result is full. + % expand B to a full matrix + B = gb_scalar_to_full (am, an, type, gb_fmt (A), B) ; + C = gbeadd (A, op, B) ; else % both A and B are matrices. Result is sparse. C = gbeunion (A, 0, op, B, 0) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_expand.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_expand.m index e502a0dbf..c8dbda6e1 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_expand.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_expand.m @@ -6,9 +6,9 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% typecast the scalar to the desired type -scalar = gbnew (scalar, type) ; +% typecast the scalar to the desired type, and make sure it's full +scalar = gbfull (gbnew (scalar, type)) ; % expand the scalar into the pattern of S -C = gbapply2 (['1st.' type], scalar, S) ; +C = gbapply2 (['2nd.' 
type], S, scalar) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_get_scalar.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_get_scalar.m index 475389f28..95d43aba7 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_get_scalar.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_get_scalar.m @@ -10,7 +10,7 @@ [m, n] = gbsize (A) ; if (m ~= 1 || n ~= 1) - error ('input parameter %s must be a scalar', inputname (1)) ; + error ('GrB:error', 'input parameter %s must be a scalar', inputname (1)) ; end x = gb_scalar (A) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_index.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_index.m index b895e8a78..9fe0efca4 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_index.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_index.m @@ -50,7 +50,7 @@ % C ({ }), C ({ I }), C ({start,fini}), or C ({start,inc,fini}). len = length (I) ; if (len > 3) - error ('invalid indexing: usage is A ({start,inc,fini})') ; + error ('GrB:error', 'invalid indexing: usage is A ({start,inc,fini})') ; elseif (len == 0) % C ({ }) whole = true ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_index1.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_index1.m index 710d36409..95e51a205 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_index1.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_index1.m @@ -21,7 +21,7 @@ I = double (I) ; case { 'single complex', 'double complex' } - error ('array indices must be integers') ; + error ('GrB:error', 'array indices must be integers') ; otherwise % any other integer must be typecast to double, int64, or uint64. diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_issymmetric.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_issymmetric.m index caf67c351..2e14c2556 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_issymmetric.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_issymmetric.m @@ -42,7 +42,7 @@ otherwise - error ('invalid option') ; + error ('GrB:error', 'invalid option') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_max2.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_max2.m index 581f85381..0b3d371ea 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_max2.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_max2.m @@ -23,7 +23,7 @@ C = gbeadd (A, op, B) ; else % since A <= 0, the result is sparse. - C = gbapply2 (A, op, B) ; + C = gbapply2 (gbfull (A), op, B) ; end end else @@ -35,7 +35,7 @@ C = gbeadd (A, op, B) ; else % since B <= 0, the result is sparse. - C = gbapply2 (A, op, B) ; + C = gbapply2 (A, op, gbfull (B)) ; end else % both A and B are matrices. Result is sparse. diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_max3.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_max3.m index b5ff9d7d6..21c0435d1 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_max3.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_max3.m @@ -19,7 +19,7 @@ % giving an m-by-1 column vector. C = gb_maxbyrow (op, A) ; else - error ('invalid option') ; + error ('GrB:error', 'invalid option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbycol.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbycol.m index b4a475b6b..0538907df 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbycol.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbycol.m @@ -23,7 +23,7 @@ % all columns A(:,j) have between 1 and m-1 entries C = gbapply2 (op, C, zero) ; else - d = gbapply2 (['1st.' ctype], zero, d) ; + d = gbapply2 (['2nd.' 
ctype], d, zero) ; % if d (j) is between 1 and m-1 and C (j) < 0 then C (j) = 0 C = gbeadd (op, C, d) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbyrow.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbyrow.m index fb6e9c80b..5030ba162 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbyrow.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_maxbyrow.m @@ -22,7 +22,7 @@ % all rows A(i,:) have between 1 and n-1 entries C = gbapply2 (op, C, zero) ; else - d = gbapply2 (['1st.' ctype], zero, d) ; + d = gbapply2 (['2nd.' ctype], d, zero) ; % if d(i) is between 1 and n-1 and C(i) < 0 then C(i) = 0 C = gbeadd (op, C, d) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_min2.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_min2.m index 18d7dcbbd..262723d9b 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_min2.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_min2.m @@ -23,7 +23,7 @@ C = gbeadd (A, op, B) ; else % since A >= 0, the result is sparse. - C = gbapply2 (A, op, B) ; + C = gbapply2 (gbfull (A), op, B) ; end end else @@ -35,7 +35,7 @@ C = gbeadd (A, op, B) ; else % since B >= 0, the result is sparse. - C = gbapply2 (A, op, B) ; + C = gbapply2 (A, op, gbfull (B)) ; end else % both A and B are matrices. Result is sparse. diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_min3.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_min3.m index bc6d9a69a..584df343f 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_min3.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_min3.m @@ -19,7 +19,7 @@ % giving an m-by-1 column vector. C = gb_minbyrow (op, A) ; else - error ('invalid option') ; + error ('GrB:error', 'invalid option') ; end end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_minbycol.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_minbycol.m index c7fcb9a2f..38345abd2 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_minbycol.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_minbycol.m @@ -22,7 +22,7 @@ % all columns A(:,j) have between 1 and m-1 entries C = gbapply2 (op, C, zero) ; else - d = gbapply2 (['1st.' ctype], zero, d) ; + d = gbapply2 (['2nd.' ctype], d, zero) ; % if d (j) is between 1 and m-1 and C (j) > 0 then C (j) = 0 C = gbeadd (op, C, d) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_minbyrow.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_minbyrow.m index e5a1c385d..4425688c3 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_minbyrow.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_minbyrow.m @@ -21,7 +21,7 @@ % all rows A(i,:) have between 1 and n-1 entries C = gbapply2 (op, C, zero) ; else - d = gbapply2 (['1st.' ctype], zero, d) ; + d = gbapply2 (['2nd.' 
ctype], d, zero) ; % if d(i) is between 1 and n-1 and C(i) > 0 then C (i) = 0 C = gbeadd (op, C, d) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_numel.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_numel.m index 77ee86e37..2ef2c05bb 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_numel.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_numel.m @@ -8,11 +8,9 @@ [m, n] = gbsize (G) ; s = m*n ; -try - if (m > flintmax || n > flintmax || s > flintmax) - % use the VPA if available, for really huge matrices +if (m > flintmax || n > flintmax || s > flintmax) + % use the VPA if available, for really huge matrices + if (exist ('vpa')) s = vpa (vpa (m, 64) * vpa (n, 64), 128) ; end -catch end - diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_args.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_args.m index fd0e43969..c1692affb 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_args.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_args.m @@ -20,7 +20,7 @@ if (ischar (arg)) if (isequal (arg, 'like')) if (nargs ~= k+1) - error ('usage: GrB.%s (m, n, ''like'', G)', func) ; + error ('GrB:error', 'usage: GrB.%s (m, n, ''like'', G)', func) ; end arg = varargin {k+1} ; if (isobject (arg)) @@ -29,7 +29,7 @@ type = gbtype (arg) ; else if (nargs ~= k) - error ('usage: GrB.%s (m, n, type)', func) ; + error ('GrB:error', 'usage: GrB.%s (m, n, type)', func) ; end type = arg ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_dimensions.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_dimensions.m index 17995a881..c64fde439 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_dimensions.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_parse_dimensions.m @@ -22,10 +22,10 @@ % C = ones ([m n]) [m, n] = gb_get_2scalars (arg1) ; else - error ('invalid dimensions') ; + error ('GrB:error', 'invalid dimensions') ; end - case { 2 } + otherwise % C = ones (m, n) m = gb_get_scalar (arg1) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_prod.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_prod.m index ea6f7e91d..4f73fe2a4 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_prod.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_prod.m @@ -51,7 +51,7 @@ otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_random.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_random.m index ff8fe094e..a5a4c8fa1 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_random.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_random.m @@ -32,13 +32,13 @@ end [rm, rn, type] = gbsize (range) ; if (rm*rn > 2) - error ('range must contain at most 2 entries') ; + error ('GrB:error', 'range can contain at most 2 entries') ; end range = gbfull (range, type, 0, struct ('kind', 'full')) ; case { 'unsymmetric', 'symmetric', 'hermitian' } sym_option = arg ; otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end end end @@ -60,7 +60,7 @@ end [m, n] = gbsize (A) ; if ((symmetric || hermitian) && (m ~= n)) - error ('input matrix must be square') ; + error ('GrB:error', 'input matrix must be square') ; end [I, J] = gbextracttuples (A, desc) ; e = length (I) ; @@ -92,7 +92,7 @@ else - error ('invalid usage') ; + error ('GrB:error', 'invalid usage') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_spones.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_spones.m index 5ad70b5c6..ff12c6b89 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_spones.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_spones.m @@ -16,7 +16,7 @@ end else if (~ischar (type)) - error ('type must be a string') ; + error 
('GrB:error', 'type must be a string') ; end op = ['1.' type] ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gb_sum.m b/GraphBLAS/GraphBLAS/@GrB/private/gb_sum.m index a3971c9dc..832cb849b 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gb_sum.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gb_sum.m @@ -35,6 +35,6 @@ otherwise - error ('unknown option') ; + error ('GrB:error', 'unknown option') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbbandwidth.m b/GraphBLAS/GraphBLAS/@GrB/private/gbbandwidth.m new file mode 100644 index 000000000..5fc2e4a57 --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbbandwidth.m @@ -0,0 +1,7 @@ +function [lo, hi] = gbbandwidth (G, compute_hi, compute_lo) %#ok + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +% SPDX-License-Identifier: GPL-3.0-or-later + +error ('GrB:mex', 'mexFunction not found; use gbmake to compile GraphBLAS') ; + diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbmake.m b/GraphBLAS/GraphBLAS/@GrB/private/gbmake.m index 1874df67c..c90c92caa 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbmake.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbmake.m @@ -27,11 +27,11 @@ function gbmake (what) if (have_octave) if verLessThan ('octave', '7') - error ('Octave 7 or later is required') ; + error ('GrB:mex', 'Octave 7 or later is required') ; end else if verLessThan ('matlab', '9.4') - error ('MATLAB 9.4 (R2018a) or later is required') ; + error ('GrB:mex', 'MATLAB 9.4 (R2018a) or later is required') ; end % MATLAB 9.10 (R2021a) and following include a built-in GraphBLAS library % that conflicts with this version, so rename this version. diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbreduce.m b/GraphBLAS/GraphBLAS/@GrB/private/gbreduce.m index 0a893de1f..940544672 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbreduce.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbreduce.m @@ -1,4 +1,4 @@ -function c = gbreduce (cin, accum, op, A, desc) %#ok +function [c,k] = gbreduce (cin, accum, op, A, desc) %#ok % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbselect.m b/GraphBLAS/GraphBLAS/@GrB/private/gbselect.m index d65fe3bb8..759d7c01d 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbselect.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbselect.m @@ -1,4 +1,4 @@ -function C = gbselect (Cin, M, accum, op, A, b, desc) %#ok +function [C,k] = gbselect (Cin, M, accum, op, A, b, desc) %#ok % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbsubassign.m b/GraphBLAS/GraphBLAS/@GrB/private/gbsubassign.m index bab8cb592..b3f20a692 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbsubassign.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbsubassign.m @@ -1,4 +1,4 @@ -function C = gbsubassign (Cin, M, accum, A, I, j, desc) %#ok +function [C,k] = gbsubassign (Cin, M, accum, A, I, j, desc) %#ok % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbtrans.m b/GraphBLAS/GraphBLAS/@GrB/private/gbtrans.m index 550b3a355..ab6561c1d 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbtrans.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbtrans.m @@ -1,4 +1,4 @@ -function C = gbtrans (Cin, M, accum, A, desc) %#ok +function [C,k] = gbtrans (Cin, M, accum, A, desc) %#ok % SuiteSparse:GraphBLAS, Timothy A. 
Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/@GrB/private/gbvreduce.m b/GraphBLAS/GraphBLAS/@GrB/private/gbvreduce.m index 21dfe60e3..20982a877 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/gbvreduce.m +++ b/GraphBLAS/GraphBLAS/@GrB/private/gbvreduce.m @@ -1,4 +1,4 @@ -function C = gbvreduce (Cin, M, accum, op, A, desc) %#ok +function [C,k] = gbvreduce (Cin, M, accum, op, A, desc) %#ok % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbapply2.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbapply2.c index c29858efc..01642b6e8 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbapply2.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbapply2.c @@ -18,8 +18,9 @@ // C = gbapply2 (Cin, M, op, A, B, desc) // C = gbapply2 (Cin, M, accum, op, A, B, desc) -// Either A or B (or both) must be a scalar (1-by-1, with 0 or 1 entries). -// If the scalar has no entry, it is treated as the value zero. +// Either A or B (or both) must be a non-empty scalar (1-by-1, with 1 entry). +// If both A and B are non-empty scalars, then A is treated as the input +// 'matrix' and B is treated as the scalar. // If Cin is not present then it is implicitly a matrix with no entries, of the // right size (which depends on A, B, and the descriptor). @@ -96,49 +97,42 @@ void mexFunction // determine which input is the scalar and which is the matrix //-------------------------------------------------------------------------- - GrB_Index anrows, ancols, bnrows, bncols ; + GrB_Index anrows, ancols, bnrows, bncols, anvals, bnvals ; // get the size of A and B OK (GrB_Matrix_nrows (&anrows, A)) ; OK (GrB_Matrix_ncols (&ancols, A)) ; + OK (GrB_Matrix_nvals (&anvals, A)) ; OK (GrB_Matrix_nrows (&bnrows, B)) ; OK (GrB_Matrix_ncols (&bncols, B)) ; + OK (GrB_Matrix_nvals (&bnvals, B)) ; - GrB_Scalar scalar = NULL, scalar0 = NULL ; + GrB_Scalar scalar = NULL ; bool binop_bind1st ; - if (anrows == 1 && ancols == 1) - { - // A is the scalar and B is the matrix - binop_bind1st = true ; - scalar = (GrB_Scalar) A ; // NOTE: this is not allowed by the spec - } - else if (bnrows == 1 && bncols == 1) + bool A_is_scalar = (anrows == 1 && ancols == 1 && anvals == 1) ; + bool B_is_scalar = (bnrows == 1 && bncols == 1 && bnvals == 1) ; + + if (B_is_scalar) { // A is the matrix and B is the scalar binop_bind1st = false ; scalar = (GrB_Scalar) B ; // NOTE: this is not allowed by the spec } + else if (A_is_scalar) + { + // A is the scalar and B is the matrix + binop_bind1st = true ; + scalar = (GrB_Scalar) A ; // NOTE: this is not allowed by the spec + } else { - ERROR ("either A or B must be a scalar") ; + ERROR ("either A or B must be a non-empty scalar") ; } //-------------------------------------------------------------------------- // make sure the scalar has one entry //-------------------------------------------------------------------------- - GrB_Index nvals ; - OK (GrB_Scalar_nvals (&nvals, scalar)) ; - if (nvals == 0) - { - // scalar must have an entry. Create a new scalar zero. 
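// [editor's note] the zero-expansion fallback below is what this change
// deletes: gbapply2 now rejects an empty scalar outright (see the revised
// check above), and @GrB m-file callers such as gb_max2, gb_min2, and
// rdivide densify a possibly-empty scalar operand with gbfull before
// calling gbapply2.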
- OK (GrB_Scalar_dup (&scalar0, scalar)) ; - // the scalar need not be int32; this will typecast as needed - OK (GrB_Scalar_setElement_INT32 (scalar0, 0)) ; - OK (GrB_Scalar_wait (scalar0, GrB_MATERIALIZE)) ; - scalar = scalar0 ; - } - // extract the int64 value of the scalar int64_t ithunk = 0 ; OK (GrB_Scalar_extractElement_INT64 (&ithunk, scalar)) ; @@ -245,7 +239,6 @@ void mexFunction OK (GrB_Matrix_free (&M)) ; OK (GrB_Matrix_free (&A)) ; OK (GrB_Matrix_free (&B)) ; - OK (GrB_Scalar_free (&scalar0)) ; OK (GrB_Scalar_free (&Thunk)) ; OK (GrB_Descriptor_free (&desc)) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargminmax.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargminmax.c index 0cace365d..79bd1e7bd 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargminmax.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargminmax.c @@ -372,5 +372,6 @@ void mexFunction pargout [0] = gb_export (&x, KIND_GRB) ; pargout [1] = gb_export (&p, KIND_GRB) ; + GB_WRAPUP ; } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargsort.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargsort.c index 93c8e03f7..1e4dbe235 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargsort.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbargsort.c @@ -32,6 +32,11 @@ void mexFunction //-------------------------------------------------------------------------- gb_usage (nargin == 3 && (nargout == 2 || nargout == 1), USAGE) ; + + //-------------------------------------------------------------------------- + // find the arguments and determine the sort direction + //-------------------------------------------------------------------------- + GrB_Matrix A = gb_get_shallow (pargin [0]) ; int dim = (int) mxGetScalar (pargin [1]) ; CHECK_ERROR (dim < 0 || dim > 2, "invalid dim") ; @@ -45,7 +50,7 @@ void mexFunction GrB_BinaryOp op ; if (MATCH (direction, "ascend")) - { + { // ascending sort if (type == GrB_BOOL ) op = GrB_LT_BOOL ; else if (type == GrB_INT8 ) op = GrB_LT_INT8 ; @@ -61,7 +66,7 @@ void mexFunction else ERROR ("unsupported type") ; } else if (MATCH (direction, "descend")) - { + { // descending sort if (type == GrB_BOOL ) op = GrB_GT_BOOL ; else if (type == GrB_INT8 ) op = GrB_GT_INT8 ; @@ -77,18 +82,18 @@ void mexFunction else ERROR ("unsupported type") ; } else - { + { ERROR2 ("unrecognized direction: %s\n", direction) ; } GrB_Descriptor desc ; if (dim == 1) - { + { // sort the columns of A desc = GrB_DESC_T0 ; } else // dim == 2 - { + { // sort the rows of A desc = NULL ; } @@ -119,7 +124,7 @@ void mexFunction //-------------------------------------------------------------------------- if (P != NULL) - { + { OK (GrB_Matrix_apply_BinaryOp2nd_INT64 (P, NULL, NULL, GrB_PLUS_INT64, P, (int64_t) 1, NULL)) ; } @@ -130,8 +135,9 @@ void mexFunction pargout [0] = gb_export (&C, KIND_GRB) ; if (nargout > 1) - { + { pargout [1] = gb_export (&P, KIND_GRB) ; } + GB_WRAPUP ; } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbandwidth.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbandwidth.c new file mode 100644 index 000000000..5c9039d1e --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbandwidth.c @@ -0,0 +1,186 @@ +//------------------------------------------------------------------------------ +// gbbandwidth: compute the lower and/or upper bandwidth of a GrB matrix +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. 
Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: GPL-3.0-or-later + +//------------------------------------------------------------------------------ + +// usage: + +// [lo,hi] = gbbandwidth (A, compute_lo, compute_hi) + +#include "gb_interface.h" + +#define USAGE "usage: [lo,hi] = gbbandwidth (A, compute_lo, compute_hi)" + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + gb_usage (nargin == 3 && nargout == 2, USAGE) ; + GrB_Matrix A = gb_get_shallow (pargin [0]) ; + bool compute_lo = (bool) mxGetScalar (pargin [1]) ; + bool compute_hi = (bool) mxGetScalar (pargin [2]) ; + GrB_Index nrows, ncols ; + OK (GrB_Matrix_nrows (&nrows, A)) ; + OK (GrB_Matrix_ncols (&ncols, A)) ; + + //-------------------------------------------------------------------------- + // compute lo and hi + //-------------------------------------------------------------------------- + + int64_t hi = 0, lo = 0 ; + GrB_Matrix x = NULL, imin = NULL, imax = NULL, idiag = NULL ; + + GxB_Format_Value fmt ; + OK (GxB_Matrix_Option_get (A, GxB_FORMAT, &fmt)) ; + bool by_col = (fmt == GxB_BY_COL) ; + + if (by_col) + { + + //---------------------------------------------------------------------- + // A is held by column + //---------------------------------------------------------------------- + + OK (GrB_Matrix_new (&x, GrB_BOOL, 1, nrows)) ; + OK (GrB_Matrix_new (&imin, GrB_INT64, 1, ncols)) ; + OK (GrB_Matrix_new (&imax, GrB_INT64, 1, ncols)) ; + OK (GrB_Matrix_new (&idiag, GrB_INT64, 1, ncols)) ; + + // x = true (1, nrows) + OK (GrB_Matrix_assign_BOOL (x, NULL, NULL, true, GrB_ALL, 1, GrB_ALL, + nrows, NULL)) ; + + if (compute_hi) + { + // imin = x*A, where imin(j) = min row index in column j + OK (GrB_mxm (imin, NULL, NULL, GxB_MIN_FIRSTJ_INT64, x, A, NULL)) ; + } + + if (compute_lo) + { + // imax = x*A, where imax(j) = max row index in column j + OK (GrB_mxm (imax, NULL, NULL, GxB_MAX_FIRSTJ_INT64, x, A, NULL)) ; + } + + // construct idiag: idiag(j) = j with same sparsity pattern as imin/imax + OK (GrB_Matrix_apply_IndexOp_INT64 (idiag, NULL, NULL, + GrB_COLINDEX_INT64, compute_hi ? 
imin : imax, 0, NULL)) ; + + if (compute_hi) + { + // imin = idiag - imin + OK (GrB_Matrix_eWiseMult_BinaryOp (imin, NULL, NULL, + GrB_MINUS_INT64, idiag, imin, NULL)) ; + // hi = max (imin, 0) ; + OK (GrB_Matrix_reduce_INT64 (&hi, GrB_MAX_INT64, + GrB_MAX_MONOID_INT64, imin, NULL)) ; + } + + if (compute_lo) + { + // imax = imax - idiag + OK (GrB_Matrix_eWiseMult_BinaryOp (imax, NULL, NULL, + GrB_MINUS_INT64, imax, idiag, NULL)) ; + // lo = max (imax, 0) ; + OK (GrB_Matrix_reduce_INT64 (&lo, GrB_MAX_INT64, + GrB_MAX_MONOID_INT64, imax, NULL)) ; + } + + } + else + { + + //---------------------------------------------------------------------- + // A is held by row + //---------------------------------------------------------------------- + + OK (GrB_Matrix_new (&x, GrB_BOOL, ncols, 1)) ; + OK (GrB_Matrix_new (&imin, GrB_INT64, nrows, 1)) ; + OK (GrB_Matrix_new (&imax, GrB_INT64, nrows, 1)) ; + OK (GrB_Matrix_new (&idiag, GrB_INT64, nrows, 1)) ; + + // x = true (ncols, 1) + OK (GrB_Matrix_assign_BOOL (x, NULL, NULL, true, GrB_ALL, ncols, + GrB_ALL, 1, NULL)) ; + + if (compute_lo) + { + // imin = A*x, where imin(i) = min column index in row i + OK (GrB_mxm (imin, NULL, NULL, GxB_MIN_FIRSTJ_INT64, A, x, NULL)) ; + } + + if (compute_hi) + { + // imax = A*x, where imax(i) = max column index in row i + OK (GrB_mxm (imax, NULL, NULL, GxB_MAX_FIRSTJ_INT64, A, x, NULL)) ; + } + + // construct idiag: idiag(i) = i with same sparsity pattern as imin/imax + OK (GrB_Matrix_apply_IndexOp_INT64 (idiag, NULL, NULL, + GrB_ROWINDEX_INT64, compute_lo ? imin : imax, 0, NULL)) ; + + if (compute_lo) + { + // imin = idiag - imin + OK (GrB_Matrix_eWiseMult_BinaryOp (imin, NULL, NULL, + GrB_MINUS_INT64, idiag, imin, NULL)) ; + // lo = max (imin, 0) ; + OK (GrB_Matrix_reduce_INT64 (&lo, GrB_MAX_INT64, + GrB_MAX_MONOID_INT64, imin, NULL)) ; + } + + if (compute_hi) + { + // imax = imax - idiag + OK (GrB_Matrix_eWiseMult_BinaryOp (imax, NULL, NULL, + GrB_MINUS_INT64, imax, idiag, NULL)) ; + // hi = max (imax, 0) ; + OK (GrB_Matrix_reduce_INT64 (&hi, GrB_MAX_INT64, + GrB_MAX_MONOID_INT64, imax, NULL)) ; + } + } + + OK (GrB_Matrix_free (&x)) ; + OK (GrB_Matrix_free (&idiag)) ; + OK (GrB_Matrix_free (&imin)) ; + OK (GrB_Matrix_free (&imax)) ; + + //-------------------------------------------------------------------------- + // return result as int64 scalars + //-------------------------------------------------------------------------- + + if (lo > FLINTMAX || hi > FLINTMAX) + { + // output is int64 to avoid flint overflow + int64_t *p ; + pargout [0] = mxCreateNumericMatrix (1, 1, mxINT64_CLASS, mxREAL) ; + // use mxGetData (best for Octave, fine for MATLAB) + p = (int64_t *) mxGetData (pargout [0]) ; + p [0] = (int64_t) lo ; + pargout [1] = mxCreateNumericMatrix (1, 1, mxINT64_CLASS, mxREAL) ; + p = (int64_t *) mxGetData (pargout [1]) ; + p [0] = (int64_t) hi ; + } + else + { + // output is double + pargout [0] = mxCreateDoubleScalar ((double) lo) ; + pargout [1] = mxCreateDoubleScalar ((double) hi) ; + } + + GB_WRAPUP ; +} + diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbuild.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbuild.c index 05ed83f9e..a98e552ed 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbuild.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbbuild.c @@ -163,8 +163,7 @@ void mexFunction else { // m is provided on input - CHECK_ERROR (!gb_mxarray_is_scalar (pargin [3]), "m must be a scalar") ; - nrows = (GrB_Index) mxGetScalar (pargin [3]) ; + nrows = 
gb_mxget_uint64_scalar (pargin [3], "m") ; } if (nargin < 5) @@ -184,8 +183,7 @@ void mexFunction else { // n is provided on input - CHECK_ERROR (!gb_mxarray_is_scalar (pargin [4]), "n must be a scalar") ; - ncols = (GrB_Index) mxGetScalar (pargin [4]) ; + ncols = gb_mxget_uint64_scalar (pargin [4], "n") ; } //-------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbdeserialize.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbdeserialize.c index 5c56d5bc1..f97c176b8 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbdeserialize.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbdeserialize.c @@ -32,14 +32,16 @@ void mexFunction gb_usage ((nargin >= 1 || nargin <= 3) && nargout <= 1, USAGE) ; CHECK_ERROR (mxGetClassID (pargin [0]) != mxUINT8_CLASS - || mxGetN (pargin [0]) != 1, "blob must be a uint8 column vector") ; + || mxIsSparse (pargin [0]), "blob must be a uint8 dense matrix/vector"); //-------------------------------------------------------------------------- - // get the blob + // get the blob, normally a row or column vector, but can be a dense matrix //-------------------------------------------------------------------------- void *blob = mxGetData (pargin [0]) ; - GrB_Index blob_size = (GrB_Index) mxGetM (pargin [0]) ; + GrB_Index m = (GrB_Index) mxGetM (pargin [0]) ; + GrB_Index n = (GrB_Index) mxGetN (pargin [0]) ; + GrB_Index blob_size = m*n ; //-------------------------------------------------------------------------- // deserialize the blob into a matrix diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbextract.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbextract.c index 69e449fbb..9fdbbb1e6 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbextract.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbextract.c @@ -130,18 +130,22 @@ void mexFunction if (anrows == 1 && ncells == 1) { // only J is present - J = gb_mxcell_to_index (Cell [0], base, ancols, &J_allocated, &nj) ; + J = gb_mxcell_to_index (Cell [0], base, ancols, &J_allocated, &nj, + NULL) ; } else if (ncells == 1) { // only I is present - I = gb_mxcell_to_index (Cell [0], base, anrows, &I_allocated, &ni) ; + I = gb_mxcell_to_index (Cell [0], base, anrows, &I_allocated, &ni, + NULL) ; } else if (ncells == 2) { // both I and J are present - I = gb_mxcell_to_index (Cell [0], base, anrows, &I_allocated, &ni) ; - J = gb_mxcell_to_index (Cell [1], base, ancols, &J_allocated, &nj) ; + I = gb_mxcell_to_index (Cell [0], base, anrows, &I_allocated, &ni, + NULL) ; + J = gb_mxcell_to_index (Cell [1], base, ancols, &J_allocated, &nj, + NULL) ; } //-------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbformat.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbformat.c index 481f1141c..544404dde 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbformat.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbformat.c @@ -50,7 +50,6 @@ void mexFunction //---------------------------------------------------------------------- // get the global format - gb_usage (nargout <= 1, USAGE) ; OK (GxB_Global_Option_get (GxB_FORMAT, &fmt)) ; } @@ -60,11 +59,11 @@ void mexFunction if (mxIsChar (pargin [0])) { + //------------------------------------------------------------------ // GrB.format (format) //------------------------------------------------------------------ - gb_usage (nargout <= 1, USAGE) ; // parse the format 
string int ignore ; bool ok = gb_mxstring_to_format (pargin [0], &fmt, &ignore) ; @@ -157,7 +156,7 @@ void mexFunction pargout [1] = mxCreateString (s) ; } if (nargout > 2) - { + { pargout [2] = mxCreateString (iso ? "iso-valued" : "non-iso-valued") ; } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gblogassign.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gblogassign.c index f37935b25..a5d37d269 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gblogassign.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gblogassign.c @@ -39,7 +39,7 @@ [m n] = size (C) ; mnz = nnz (M) ; % A must be mnz-by-1 if (~isequal (size (A), [mnz 1])) - error ('A must be nnz(M)-by-1') + error ('GrB:error', 'A must be nnz(M)-by-1') end [ai, ~, ax] = GrB.extracttuples (A) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbnew.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbnew.c index e433b5ea8..42ab74000 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbnew.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbnew.c @@ -136,8 +136,8 @@ void mexFunction //------------------------------------------------------------------ // m-by-n GraphBLAS double matrix, no entries, default format - GrB_Index nrows = mxGetScalar (pargin [0]) ; - GrB_Index ncols = mxGetScalar (pargin [1]) ; + GrB_Index nrows = gb_mxget_uint64_scalar (pargin [0], "m") ; + GrB_Index ncols = gb_mxget_uint64_scalar (pargin [1], "n") ; C = gb_new (GrB_FP64, nrows, ncols, -1, 0) ; } @@ -167,8 +167,8 @@ void mexFunction //------------------------------------------------------------------ // create an m-by-n matrix with no entries - GrB_Index nrows = mxGetScalar (pargin [0]) ; - GrB_Index ncols = mxGetScalar (pargin [1]) ; + GrB_Index nrows = gb_mxget_uint64_scalar (pargin [0], "m") ; + GrB_Index ncols = gb_mxget_uint64_scalar (pargin [1], "n") ; GrB_Type type = gb_mxstring_to_type (pargin [2]) ; bool ok = gb_mxstring_to_format (pargin [2], &fmt, &sparsity) ; @@ -249,8 +249,8 @@ void mexFunction // create an m-by-n matrix with no entries, of the requested // type and format - GrB_Index nrows = mxGetScalar (pargin [0]) ; - GrB_Index ncols = mxGetScalar (pargin [1]) ; + GrB_Index nrows = gb_mxget_uint64_scalar (pargin [0], "m") ; + GrB_Index ncols = gb_mxget_uint64_scalar (pargin [1], "n") ; GrB_Type type = gb_mxstring_to_type (pargin [2]) ; bool ok = gb_mxstring_to_format (pargin [3], &fmt, &sparsity) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbreshape.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbreshape.c new file mode 100644 index 000000000..57e125395 --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbreshape.c @@ -0,0 +1,51 @@ +//------------------------------------------------------------------------------ +// gbreshape: reshape a GraphBLAS matrix +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
+// SPDX-License-Identifier: GPL-3.0-or-later + +//------------------------------------------------------------------------------ + +// usage: + +// C = gbreshape (A, nrows_new, ncols_new, by_col) + +#include "gb_interface.h" + +#define USAGE "usage: C = gbreshape (A, nrows_new, ncols_new, by_col)" + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + gb_usage ((nargin == 3 || nargin == 4) && nargout == 1, USAGE) ; + GrB_Matrix A = gb_get_shallow (pargin [0]) ; + GrB_Index nrows_new = gb_mxget_uint64_scalar (pargin [1], "nrows_new") ; + GrB_Index ncols_new = gb_mxget_uint64_scalar (pargin [2], "ncols_new") ; + bool by_col = (nargin == 3) ? true : ((bool) mxGetScalar (pargin [3])) ; + + //-------------------------------------------------------------------------- + // reshape the matrix + //-------------------------------------------------------------------------- + + GrB_Matrix C = NULL ; + OK (GxB_Matrix_reshapeDup (&C, A, by_col, nrows_new, ncols_new, NULL)) ; + + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + pargout [0] = gb_export (&C, KIND_GRB) ; + GB_WRAPUP ; +} + diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c index 8fc00c2cf..18e9734f0 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbselect.c @@ -124,7 +124,7 @@ void mexFunction gb_get_mxargs (nargin, pargin, USAGE, Matrix, &nmatrices, String, &nstrings, Cell, &ncells, &desc, &base, &kind, &fmt, &sparsity) ; - CHECK_ERROR (nmatrices < 1 || nmatrices > 4 || nstrings < 1 || ncells > 0, + CHECK_ERROR (nmatrices < 1 || nmatrices > 4 || nstrings < 1 || ncells > 0, USAGE) ; //-------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbserialize.c b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbserialize.c index 50e327395..0ab74fe93 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbserialize.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/mexfunctions/gbserialize.c @@ -49,7 +49,7 @@ void mexFunction { method = GxB_COMPRESSION_NONE ; } - else if (MATCH (method_name, "default") || MATCH (method_name, "lz4")) + else if (MATCH (method_name, "lz4")) { method = GxB_COMPRESSION_LZ4 ; } @@ -57,6 +57,11 @@ void mexFunction { method = GxB_COMPRESSION_LZ4HC ; } + else if (MATCH (method_name, "default") || MATCH (method_name, "zstd")) + { + // the default is ZSTD, with level 1 + method = GxB_COMPRESSION_ZSTD ; + } else if (MATCH (method_name, "debug")) { // use GrB_Matrix_serializeSize and GrB_Matrix_serialize, just @@ -82,11 +87,11 @@ void mexFunction method = GxB_COMPRESSION_LZSS ; } else if (MATCH (method_name, "intel:lz4")) - { + { method = GxB_COMPRESSION_INTEL + GxB_COMPRESSION_LZ4 ; } else if (MATCH (method_name, "intel:lz4hc")) - { + { method = GxB_COMPRESSION_INTEL + GxB_COMPRESSION_LZ4HC ; } else if (MATCH (method_name, "intel:zlib")) @@ -112,10 +117,10 @@ void mexFunction } // get the method level if (nargin > 2) - { + { level = (int) mxGetScalar (pargin [2]) ; } - if (level < 0 || level > 9) level = 0 ; + if (level < 0 || level > 999) level = 
0 ; // set the descriptor OK (GxB_Desc_set (desc, GxB_COMPRESSION, method + level)) ; } @@ -128,7 +133,7 @@ void mexFunction GrB_Index blob_size ; if (debug) - { + { // debug GrB_Matrix_serializeSize and GrB_Matrix_serialize OK (GrB_Matrix_serializeSize (&blob_size, A)) ; blob = mxMalloc (blob_size) ; @@ -137,7 +142,7 @@ void mexFunction blob = mxRealloc (blob, blob_size) ; } else - { + { // use GxB_Matrix_serialize by default OK (GxB_Matrix_serialize (&blob, &blob_size, A, desc)) ; } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_assign.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_assign.c index 6506cfd01..1465f6457 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_assign.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_assign.c @@ -112,27 +112,45 @@ void gb_assign // gbassign or gbsubassign mexFunctions GrB_Index *J = (GrB_Index *) GrB_ALL ; GrB_Index ni = cnrows, nj = cncols ; bool I_allocated = false, J_allocated = false ; + int64_t I_max = -1, J_max = -1 ; if (cnrows > 1 && cncols > 1 && ncells == 1) { - ERROR ("linear indexing not yet supported") ; + ERROR ("Linear indexing not supported") ; } if (cnrows == 1 && ncells == 1) { // only J is present - J = gb_mxcell_to_index (Cell [0], base, cncols, &J_allocated, &nj) ; + J = gb_mxcell_to_index (Cell [0], base, cncols, &J_allocated, &nj, + &J_max) ; } else if (ncells == 1) { // only I is present - I = gb_mxcell_to_index (Cell [0], base, cnrows, &I_allocated, &ni) ; + I = gb_mxcell_to_index (Cell [0], base, cnrows, &I_allocated, &ni, + &I_max) ; } else if (ncells == 2) { // both I and J are present - I = gb_mxcell_to_index (Cell [0], base, cnrows, &I_allocated, &ni) ; - J = gb_mxcell_to_index (Cell [1], base, cncols, &J_allocated, &nj) ; + I = gb_mxcell_to_index (Cell [0], base, cnrows, &I_allocated, &ni, + &I_max) ; + J = gb_mxcell_to_index (Cell [1], base, cncols, &J_allocated, &nj, + &J_max) ; + } + + //-------------------------------------------------------------------------- + // expand C if needed + //-------------------------------------------------------------------------- + + GrB_Index cnrows_required = I_max + 1 ; + GrB_Index cncols_required = J_max + 1 ; + if (cnrows_required > cnrows || cncols_required > cncols) + { + GrB_Index cnrows_new = GB_IMAX (cnrows, cnrows_required) ; + GrB_Index cncols_new = GB_IMAX (cncols, cncols_required) ; + OK (GrB_Matrix_resize (C, cnrows_new, cncols_new)) ; } //-------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_error.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_error.c new file mode 100644 index 000000000..f76d16bf1 --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_error.c @@ -0,0 +1,52 @@ +//------------------------------------------------------------------------------ +// gb_error: return a string from a GraphBLAS GrB_info +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
+// SPDX-License-Identifier: GPL-3.0-or-later + +//------------------------------------------------------------------------------ + +#include "gb_interface.h" + +const char *gb_error // return an error message from a GrB_Info value +( + GrB_Info info +) +{ + switch (info) + { + + case GrB_SUCCESS : return ("success") ; + + //---------------------------------------------------------------------- + // informational codes, not an error: + //---------------------------------------------------------------------- + + case GrB_NO_VALUE : return ("no entry present") ; + case GxB_EXHAUSTED : return ("iterator is exhausted") ; + + //---------------------------------------------------------------------- + // errors: + //---------------------------------------------------------------------- + + case GrB_UNINITIALIZED_OBJECT : return ("uninitialized object") ; + case GrB_NULL_POINTER : return ("input pointer is NULL") ; + case GrB_INVALID_VALUE : return ("invalid value") ; + case GrB_INVALID_INDEX : return ("row or column index out of bounds") ; + case GrB_DOMAIN_MISMATCH : return ("object domains are not compatible") ; + case GrB_DIMENSION_MISMATCH : return ("matrix dimensions are invalid") ; + case GrB_OUTPUT_NOT_EMPTY : return ("output matrix already has values") ; + case GrB_NOT_IMPLEMENTED : return ("method not implemented") ; + case GrB_OUT_OF_MEMORY : return ("out of memory") ; + case GrB_INSUFFICIENT_SPACE : return ("output array not large enough") ; + case GrB_INVALID_OBJECT : return ("object is corrupted") ; + case GrB_INDEX_OUT_OF_BOUNDS : return ("row or column index out of bounds") ; + case GrB_EMPTY_OBJECT : return ("an object does not contain a value") ; + default : + case GrB_PANIC : break ; + } + + return ("unknown error") ; +} + diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_get_mxargs.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_get_mxargs.c index 0c40f77ba..080e30a8f 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_get_mxargs.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_get_mxargs.c @@ -92,7 +92,7 @@ void gb_get_mxargs // a matrix argument is C, M, A, or B if ((*nmatrices) >= 6) { - // at most 4 matrix inputs are allowed + // at most 6 matrix inputs are allowed ERROR (usage) ; } Matrix [(*nmatrices)++] = (mxArray *) pargin [k] ; diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_interface.h b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_interface.h index 2834b2705..7c7bb22e5 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_interface.h +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_interface.h @@ -58,14 +58,21 @@ void gbcov_put (void) ; #define CHECK_ERROR(error,message) if (error) ERROR (message) ; -#define OK(method) CHECK_ERROR ((method) != GrB_SUCCESS, "GrB:error") ; +#define OK(method) \ +{ \ + GrB_Info info = method ; \ + if (info != GrB_SUCCESS) \ + { \ + ERROR (gb_error (info)) ; \ + } \ +} #define OK0(method) \ { \ GrB_Info info = method ; \ if (!(info == GrB_SUCCESS || info == GrB_NO_VALUE)) \ { \ - ERROR ("GrB:error") ; \ + ERROR (gb_error (info)) ; \ } \ } @@ -214,6 +221,11 @@ void gb_usage // check usage and make sure GxB_init has been called const char *message // error message if usage is not correct ) ; +const char *gb_error // return an error string from a GrB_Info value +( + GrB_Info info +) ; + void gb_find_dot // find 1st and 2nd dot ('.') in a string ( int32_t position [2], // positions of one or two dots @@ -374,6 +386,12 @@ bool gb_mxarray_is_scalar // true if built-in array is a scalar const mxArray *S ) ; +uint64_t gb_mxget_uint64_scalar //
return uint64 value of a MATLAB scalar +( + const mxArray *mxscalar, // MATLAB scalar to extract + char *name // name of the scalar +) ; + bool gb_mxarray_is_empty // true if built-in array is NULL, or 2D and 0-by-0 ( const mxArray *S @@ -399,7 +417,8 @@ GrB_Index *gb_mxcell_to_index // return index list I base_enum_t base, // I is one-based or zero-based const GrB_Index n, // dimension of matrix being indexed bool *I_allocated, // true if output array I is allocated - GrB_Index *ni // length (I) + GrB_Index *ni, // length (I) + int64_t *I_max // max (I) is computed if I_max is not NULL ) ; GrB_BinaryOp gb_first_binop // return GrB_FIRST_[type] operator diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxarray_to_list.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxarray_to_list.c index d5ebd5a37..ce08ec75a 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxarray_to_list.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxarray_to_list.c @@ -82,15 +82,9 @@ int64_t *gb_mxarray_to_list // return List of integers bool ok = GB_helper3 (List, List_double, (*len), List_max) ; CHECK_ERROR (!ok, "index must be integer") ; } - else if (class == mxINT64_CLASS) + else { - // input list is 1-based int64 - int64_t *List_int64 = (int64_t *) mxGetData (mxList) ; - GB_helper3i (List, List_int64, (*len), List_max) ; - } - else // if (class == mxUINT64_CLASS) - { - // input list is 1-based uint64 + // input list is 1-based int64 or uint64 int64_t *List_int64 = (int64_t *) mxGetData (mxList) ; GB_helper3i (List, List_int64, (*len), List_max) ; } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxcell_to_index.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxcell_to_index.c index 4b0e17fe8..652335ad5 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxcell_to_index.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxcell_to_index.c @@ -29,7 +29,9 @@ GrB_Index *gb_mxcell_to_index // return index list I base_enum_t base, // I is one-based or zero-based const GrB_Index n, // dimension of matrix being indexed bool *I_allocated, // true if output array I is allocated - GrB_Index *ni // length (I) + GrB_Index *ni, // length (I) + int64_t *I_max // max (I) is computed if I_max is not NULL. + // I_max is 0-based. ) { @@ -74,6 +76,11 @@ GrB_Index *gb_mxcell_to_index // return index list I (*ni) = n ; (*I_allocated) = false ; I = (GrB_Index *) GrB_ALL ; + if (I_max != NULL) + { + // I_max is the last index in the matrix, based on its dimension. 
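// [editor's note] I_max is 0-based throughout: gb_assign initializes it to
// -1 (an empty index list) and, when an index reaches past the current
// dimensions, grows C so that it has at least I_max+1 rows and J_max+1
// columns.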
+ (*I_max) = ((int64_t) n) - 1 ; + } } else if (len == 1) @@ -86,6 +93,23 @@ GrB_Index *gb_mxcell_to_index // return index list I (*ni) = Item_len [0] ; (*I_allocated) = Item_allocated [0] ; I = (GrB_Index *) (Item [0]) ; + if (I_max != NULL) + { + // find the max entry in the list + if (Item_max [0] >= 0) + { + // the max entry has already been computed (1-based) + // convert from 1-based to 0-based + (*I_max) = Item_max [0] - 1 ; + } + else + { + // find the max entry (0-based) + GrB_Index List_max = 0 ; + GB_helper4 (I, (*ni), &List_max) ; + (*I_max) = ((int64_t) List_max) - 1 ; + } + } } else if (len == 2) @@ -109,6 +133,11 @@ GrB_Index *gb_mxcell_to_index // return index list I if (Item_allocated [1]) gb_mxfree ((void **) (& (Item [1]))) ; (*ni) = GxB_RANGE ; + if (I_max != NULL) + { + // find the last index in the start:fini list + (*I_max) = (int64_t) I [GxB_END] ; + } } else // if (len == 3) @@ -128,29 +157,59 @@ GrB_Index *gb_mxcell_to_index // return index list I I [GxB_BEGIN] = Item [0][0] ; I [GxB_END ] = Item [2][0] ; I [GxB_INC ] = 0 ; - int64_t inc = Item [1][0] ; + int64_t iinc = Item [1][0] ; if (Item_allocated [1]) { - // the 2nd item in the list is inc, and if it was passed in as + // the 2nd item in the list is iinc, and if it was passed in as // 1-based, it has been decremented. So increment it to get back // to the correct value. - inc++ ; + iinc++ ; } if (Item_allocated [0]) gb_mxfree ((void **) (& (Item [0]))) ; if (Item_allocated [1]) gb_mxfree ((void **) (& (Item [1]))) ; if (Item_allocated [2]) gb_mxfree ((void **) (& (Item [2]))) ; - if (inc < 0) + if (iinc < 0) { - I [GxB_INC] = (GrB_Index) (-inc) ; + I [GxB_INC] = (GrB_Index) (-iinc) ; (*ni) = GxB_BACKWARDS ; + if (I_max != NULL) + { + // find the first entry in the list ibegin:iinc:iend. + (*I_max) = -1 ; + int64_t ibegin = (int64_t) I [GxB_BEGIN] ; + int64_t iend = (int64_t) I [GxB_END] ; + if (iinc != 0 && ibegin >= iend) + { + // the list is non-empty, for example, 7:-2:4 = [7 5] + // I_max = GB_ijlist (NULL, 0, GB_STRIDE, I) + (*I_max) = ibegin ; + } + } } else { - I [GxB_INC] = (GrB_Index) (inc) ; + I [GxB_INC] = (GrB_Index) (iinc) ; (*ni) = GxB_STRIDE ; + if (I_max != NULL) + { + // find the last entry in the list ibegin:iinc:iend. + (*I_max) = -1 ; + int64_t ibegin = (int64_t) I [GxB_BEGIN] ; + int64_t iend = (int64_t) I [GxB_END] ; + if (iinc != 0 && ibegin <= iend) + { + // the list is non-empty, for example, 4:2:9 = [4 6 8] + // nI = length of the expanded list (see GB_ijproperties), + // which is 3 for the list 4:2:9. + int64_t nI = ((iend - ibegin) / iinc) + 1 ; + // I_max = GB_ijlist (NULL, nI-1, GB_STRIDE, I), + // which is 8 for the list 4:2:9 + (*I_max) = ibegin + (nI-1) * iinc ; + } + } } } diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxget_uint64_scalar.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxget_uint64_scalar.c new file mode 100644 index 000000000..5c225bf1c --- /dev/null +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxget_uint64_scalar.c @@ -0,0 +1,50 @@ +//------------------------------------------------------------------------------ +// gb_mxget_int64_scalar: return an int64 scalar +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
+// SPDX-License-Identifier: GPL-3.0-or-later + +//------------------------------------------------------------------------------ + +#include "gb_interface.h" + +uint64_t gb_mxget_uint64_scalar // return uint64 value of a MATLAB scalar +( + const mxArray *mxscalar, // MATLAB scalar to extract + char *name // name of the scalar +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + if (!gb_mxarray_is_scalar (mxscalar)) + { + GB_COV_PUT ; + mexErrMsgIdAndTxt ("GrB:error", "%s must be a scalar", name) ; + } + + //-------------------------------------------------------------------------- + // extract the scalar + //-------------------------------------------------------------------------- + + uint64_t *p, scalar ; + + switch (mxGetClassID (mxscalar)) + { + case mxINT64_CLASS : + case mxUINT64_CLASS : + p = (uint64_t *) mxGetData (mxscalar) ; + scalar = p [0] ; + break ; + + default : + scalar = (uint64_t) mxGetScalar (mxscalar) ; + break ; + } + + return (scalar) ; +} + diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxstring_to_binop_or_idxunop.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxstring_to_binop_or_idxunop.c index d178aae9e..dda08de36 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxstring_to_binop_or_idxunop.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_mxstring_to_binop_or_idxunop.c @@ -30,9 +30,7 @@ void gb_mxstring_to_binop_or_idxunop // binop or idxunop from a string if (gb_mxarray_is_empty (mxstring)) { - // no operator is present, or present and empty; this is not yet an - // error, since many uses of GraphBLAS functions use an optional accum - // operator. + // no operator is present, or present and empty return ; } @@ -45,7 +43,7 @@ void gb_mxstring_to_binop_or_idxunop // binop or idxunop from a string gb_mxstring_to_string (opstring, LEN, mxstring, "binary/index operator") ; //-------------------------------------------------------------------------- - // convert the string to a binary operator + // convert the string to a binary operator or index unary operator //-------------------------------------------------------------------------- (*op2) = gb_string_to_binop_or_idxunop (opstring, atype, btype, idxunop, diff --git a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_string_and_type_to_unop.c b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_string_and_type_to_unop.c index ae3b791ac..9122e4fad 100644 --- a/GraphBLAS/GraphBLAS/@GrB/private/util/gb_string_and_type_to_unop.c +++ b/GraphBLAS/GraphBLAS/@GrB/private/util/gb_string_and_type_to_unop.c @@ -427,6 +427,13 @@ GrB_UnaryOp gb_string_and_type_to_unop // return op from string and type if (type == GrB_FP32 ) return (GxB_ERFC_FP32) ; if (type == GrB_FP64 ) return (GxB_ERFC_FP64) ; + } + else if (MATCH (op_name, "cbrt")) + { + + if (type == GrB_FP32 ) return (GxB_CBRT_FP32) ; + if (type == GrB_FP64 ) return (GxB_CBRT_FP64) ; + } else if (MATCH (op_name, "conj")) { diff --git a/GraphBLAS/GraphBLAS/@GrB/rdivide.m b/GraphBLAS/GraphBLAS/@GrB/rdivide.m index 2a334bf8d..5c368d4c1 100644 --- a/GraphBLAS/GraphBLAS/@GrB/rdivide.m +++ b/GraphBLAS/GraphBLAS/@GrB/rdivide.m @@ -24,12 +24,17 @@ b_is_scalar = (bm == 1) && (bn == 1) ; ctype = gboptype (atype, btype) ; +if (a_is_scalar && gb_scalar (A) == 0 && gb_isfloat (ctype)) + A = 0 ; +end + if (a_is_scalar) if (b_is_scalar) % both A and B are scalars - C = GrB (gbemult (A, '/', B)) ; + C = GrB (gbemult (A, '/', gbfull (B))) ; else - % A is a scalar, 
B is a matrix. Expand B to full with type of C + % A is a scalar, B is a matrix. + % Expand B to full with type of C C = GrB (gbapply2 (A, '/', gbfull (B, ctype))) ; end else @@ -38,7 +43,7 @@ if (gb_scalar (B) == 0 && gb_isfloat (atype)) % 0/0 is NaN, and thus must be computed if A is % floating-point. The result is a full matrix. - % expand B t a full matrix and cast to the type of A + % expand B into a full matrix and cast to the type of A B = gb_scalar_to_full (am, an, atype, gb_fmt (A), B) ; C = GrB (gbemult (A, '/', B)) ; else diff --git a/GraphBLAS/GraphBLAS/@GrB/reshape.m b/GraphBLAS/GraphBLAS/@GrB/reshape.m index 7585e66b6..a8e864879 100644 --- a/GraphBLAS/GraphBLAS/@GrB/reshape.m +++ b/GraphBLAS/GraphBLAS/@GrB/reshape.m @@ -4,13 +4,15 @@ % matrix whose elements are taken columnwise from G. The matrix G must % have numel (G) == m*n. That is numel (G) == numel (C) must be true. % +% An optional parameter allows G to be reshaped row-wise instead +% of columnwise: C = reshape (G, m, n, 'by row') or C = +% reshape (G, [m n], 'by row'). The default is 'by column'. +% % See also GrB/numel, squeeze. % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: this would be faster as a built-in GxB_reshape function. - if (isobject (G)) G = G.opaque ; end @@ -19,20 +21,21 @@ mold = int64 (mold) ; nold = int64 (nold) ; -[mnew, nnew] = gb_parse_dimensions (varargin {:}) ; +% the third output of gb_parse_args is not actually a type, but 'by row', 'by +% col', or 'double' if not present on input. +[mnew, nnew, type] = gb_parse_args ('reshape', varargin {:}) ; mnew = int64 (mnew) ; nnew = int64 (nnew) ; -if (mold * nold ~= mnew * nnew) - error ('number of elements must not change') ; +switch (type) + case 'by row' + by_col = false ; + case { 'by column', 'double' } + % if type is 'double', the row/colwise parameter is not present + by_col = true ; + otherwise + error ('GrB:error', 'unknown reshape option') ; end -desc.base = 'zero-based' ; -[iold, jold, x] = gbextracttuples (G, desc) ; -% convert i and j from 2D (mold-by-nold) to 1D indices -k = gb_2d_to_1d (iold, jold, mold) ; -% convert k from 1D indices to 2D (mnew-by-nnew) -[inew, jnew] = gb_1d_to_2d (k, mnew) ; -% rebuild the new matrix -C = GrB (gbbuild (inew, jnew, x, mnew, nnew, desc)) ; +C = GrB (gbreshape (G, mnew, nnew, by_col)) ; diff --git a/GraphBLAS/GraphBLAS/@GrB/serialize.m b/GraphBLAS/GraphBLAS/@GrB/serialize.m index 395694419..1a7984153 100644 --- a/GraphBLAS/GraphBLAS/@GrB/serialize.m +++ b/GraphBLAS/GraphBLAS/@GrB/serialize.m @@ -12,21 +12,23 @@ % more compact blob at the cost of higher run time. Levels outside % the allowable range are changed to the default level. % -% 'lz4' LZ4, with no level setting. This is the default if the -% method is not specified. Very fast with good compression. -% For large problems, lz4 can be faster than no compression, -% and it cuts the size of the blob by about 3x on average. +% 'zstd' ZSTD. The level can be 1 to 19 with 1 the default. +% This is the default method if no method is specified. % -% 'none' no compression. +% 'lz4' LZ4, with no level setting. Fast with decent compression. +% For large problems, lz4 can be faster than no compression, +% and it cuts the size of the blob by about 3x on average. % -% 'lz4hc' LZ4HC, much slower than LZ4 but results in a more compact blob. -% The level can be 1 to 9 with 9 the default. LZ4HC level 1 -provides excellent compression compared with LZ4, and higher -levels of LZ4HC only slightly improve compression quality. +% 'none' no compression. +% +% 'lz4hc' LZ4HC, much slower than LZ4 but results in a more compact blob. +% The level can be 1 to 9 with 9 the default. LZ4HC level 1 +% provides excellent compression compared with LZ4, and higher +% levels of LZ4HC only slightly improve compression quality.
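% [editor's note] a hedged sketch of choosing the method and level
% explicitly, assuming the GrB.serialize (G, method, level) calling form;
% levels outside the valid range fall back to the default, per the text
% above:
%
%   blob = GrB.serialize (G, 'zstd', 19) ; % smallest blob, slowest
%   blob = GrB.serialize (G, 'lz4') ;      % prior default, no level setting
%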
% % Example: % G = GrB (magic (5)) -% blob = GrB.serialize (G) ; % compressed via LZ4 +% blob = GrB.serialize (G) ; % compressed via ZSTD, level 1 % f = fopen ('G.bin', 'wb') ; % fwrite (f, blob) ; % fclose (f) diff --git a/GraphBLAS/GraphBLAS/@GrB/sprand.m b/GraphBLAS/GraphBLAS/@GrB/sprand.m index 750307da5..43ff64f7c 100644 --- a/GraphBLAS/GraphBLAS/@GrB/sprand.m +++ b/GraphBLAS/GraphBLAS/@GrB/sprand.m @@ -31,6 +31,6 @@ C = GrB (gb_random (m, n, d)) ; else % the 'rc' input option is not supported - error ('usage: sprand(A) or sprand(m,n,d)') ; + error ('GrB:error', 'usage: sprand(A) or sprand(m,n,d)') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/sprandn.m b/GraphBLAS/GraphBLAS/@GrB/sprandn.m index 4b6d73829..ccd8dbdb8 100644 --- a/GraphBLAS/GraphBLAS/@GrB/sprandn.m +++ b/GraphBLAS/GraphBLAS/@GrB/sprandn.m @@ -29,6 +29,6 @@ C = GrB (gb_random (m, n, d, 'normal')) ; else % the 'rc' input option is not supported - error ('usage: sprandn(A) or sprandn(m,n,d)') ; + error ('GrB:error', 'usage: sprandn(A) or sprandn(m,n,d)') ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/subsasgn.m b/GraphBLAS/GraphBLAS/@GrB/subsasgn.m index 228834837..b1bdccb6a 100644 --- a/GraphBLAS/GraphBLAS/@GrB/subsasgn.m +++ b/GraphBLAS/GraphBLAS/@GrB/subsasgn.m @@ -8,9 +8,9 @@ % sparse matrix C, both statements delete all entries in C(I,J) since % Built-in sparse matrices never include explicit zeros. % -% With a single index, C(I) = A, both C and A must be vectors; linear -% indexing is not yet supported. In this case A must either be a vector -% of length the same as I, or a scalar. +% Linear indexing is not yet fully supported. With a single index, +% C(I) = A, both C and A must be vectors, except for C(:) = A where C is +% a matrix and A is a vector. % % If M is a logical matrix, C (M) = x is an assignment via logical % indexing, where C and M have the same size, and x(:) is either a vector @@ -25,23 +25,23 @@ % times faster than C (M) = A (M) using purely built-in sparse matrices C, % M, and A, when the matrices are large. % -% If I or J are very large colon notation expressions, then C(I,J)=A is +% If I or J are very large colon notation expressions, then C(I,J) = A is % not possible, because I and J are created as explicit lists first, % before passing them to GraphBLAS. See GrB.subassign instead. See also % the example with 'help GrB.extract'. % -% Unlike the built-in C(I,J)=A, the GraphBLAS assignment does not change -% the size of C. +% Like the built-in C(I,J) = A, the GraphBLAS assignment can change the +% size of C if the indices I and J extend past the current dimensions of C. % % See also GrB/subsref, GrB/subsindex, GrB.assign, GrB.subassign. % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: add linear indexing, and allow the matrix to grow/shrink in size. +% FUTURE: add all forms of linear indexing.
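% [editor's note] a minimal sketch of the two newly supported assignment
% forms described above; the matrices shown are illustrative only:
%
%   C = GrB (magic (4)) ;
%   C (:) = 0 ;       % C becomes a 4-by-4 iso full matrix of zeros
%   C = GrB (magic (4)) ;
%   C (6, 8) = 42 ;   % C grows from 4-by-4 to 6-by-8, as with built-in
%                     % sparse matrices
%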
if (~isequal (S.type, '()')) - error ('index type %s not supported', S.type) ; + error ('GrB:error', 'index type %s not supported', S.type) ; end if (isobject (C)) @@ -74,23 +74,39 @@ else % C (I) = A [cm, cn] = gbsize (C) ; + [I, whole] = gb_index (S) ; if (cm == 1 || cn == 1) % C (I) = A for a vector or scalar C - C = GrB (gbsubassign (C, gb_index (S), A)) ; + C = GrB (gbsubassign (C, I, A)) ; else - % C (I) = A for a matrix C - error ('Linear indexing not yet supported') ; + if (whole) + [am, an] = gbsize (A) ; + if (am == 1 && an == 1) + % C (:) = scalar, the same as C (:,:) = scalar. + % C becomes an iso full matrix + C_empty = gbnew (cm, cn, gbtype (C)) ; + C = GrB (gbsubassign (C_empty, { }, { }, A)) ; + else + % C (:) = A for a matrix C and vector A + C = GrB (gbreshape (A, cm, cn, 'by column')) ; + end + else + % C (I) = A, general case not yet supported + error ('GrB:error', ... + 'Except for C(:)=A, linear indexing not yet supported') ; + end end end elseif (ndims == 2) - % C(I,J) = A where A is length(I)-by-length(J), or a scalar + % C (I,J) = A where A is length(I)-by-length(J), or a scalar C = GrB (gbsubassign (C, gb_index (S.subs {1}), gb_index (S.subs {2}), A)) ; else - error ('%dD indexing not yet supported', ndims) ; + % sparse N-dimensional arrays for N > 2 will not be supported + error ('GrB:error', '%dD indexing not supported', ndims) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/subsindex.m b/GraphBLAS/GraphBLAS/@GrB/subsindex.m index 420cd05af..f58e52d38 100644 --- a/GraphBLAS/GraphBLAS/@GrB/subsindex.m +++ b/GraphBLAS/GraphBLAS/@GrB/subsindex.m @@ -30,7 +30,7 @@ % double or single: convert to int64 I = gbextractvalues (G) ; if (~isequal (I, round (I))) - error ('array indices must be integers') ; + error ('GrB:error', 'array indices must be integers') ; end I = int64 (I) ; elseif (gb_contains (type, 'int')) @@ -38,7 +38,7 @@ I = gbextractvalues (G) ; else % logical or complex - error ('array indices must be integers') ; + error ('GrB:error', 'array indices must be integers') ; end % I must contain entries in range 0 to prod (size (A)) - 1, diff --git a/GraphBLAS/GraphBLAS/@GrB/subsref.m b/GraphBLAS/GraphBLAS/@GrB/subsref.m index 438c6c065..8bf56e726 100644 --- a/GraphBLAS/GraphBLAS/@GrB/subsref.m +++ b/GraphBLAS/GraphBLAS/@GrB/subsref.m @@ -2,7 +2,8 @@ %SUBSREF C = A(I,J) or C = A(I); extract submatrix. % C = A(I,J) extracts the A(I,J) submatrix of the GraphBLAS matrix A. % With a single index, C = A(I) extracts a subvector C of a vector A. -% Linear indexing of a matrix is not yet supported. +% For linear indexing of a 2D matrix, only C=A(:) is currently supported. +% C = A(I) is not yet supported if A is a 2D matrix. % % x = A (M) for a logical matrix M constructs an nnz(M)-by-1 vector x, for % built-in-style logical indexing. A or M may be built-in sparse or full @@ -33,7 +34,7 @@ % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later -% FUTURE: add linear indexing. +% FUTURE: add all forms of linear indexing. 
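% [editor's note] a minimal sketch of the one linear-indexing form that is
% now supported for 2D matrices; names are illustrative only:
%
%   A = GrB (magic (3)) ;
%   v = A (:) ;       % v is a 9-by-1 GrB vector, taken columnwise;
%                     % any other A(I) on a 2D matrix still errors
%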
if (isobject (A)) A = A.opaque ; @@ -41,11 +42,11 @@ [m, n] = gbsize (A) ; if (length (S) > 1) - error ('nested indexing not supported') ; + error ('GrB:error', 'nested indexing not supported') ; end if (~isequal (S.type, '()')) - error ('index type %s not supported', S.type) ; + error ('GrB:error', 'index type %s not supported', S.type) ; end ndims = length (S.subs) ; @@ -62,9 +63,9 @@ C = GrB (gblogextract (A, S)) ; else % C = A (I) + [I, whole] = gb_index (S) ; if (m == 1 || n == 1) % C = A (I) for a vector A - [I, whole] = gb_index (S) ; if (m > 1) C = gbextract (A, I, { }) ; else @@ -77,7 +78,15 @@ C = GrB (C) ; else % C = A (I) for a matrix A - error ('Linear indexing not yet supported') ; + if (whole) + % C = A (:), whole matrix case + [~, mn] = gb_2d_to_1d (0, 0, m, n) ; + C = GrB (gbreshape (A, mn, 1, 'by column')) ; + else + % C = A (I), general case not yet supported + error ('GrB:error', ... + 'Except for C=A(:), linear indexing not yet supported') ; + end end end @@ -88,7 +97,8 @@ else - error ('%dD indexing not yet supported', ndims) ; + % sparse N-dimensional arrays for N > 2 will not be supported + error ('GrB:error', '%dD indexing not supported', ndims) ; end diff --git a/GraphBLAS/GraphBLAS/@GrB/tricount.m b/GraphBLAS/GraphBLAS/@GrB/tricount.m index 4a361ed1b..e7b286536 100644 --- a/GraphBLAS/GraphBLAS/@GrB/tricount.m +++ b/GraphBLAS/GraphBLAS/@GrB/tricount.m @@ -20,7 +20,7 @@ [m, n] = size (A) ; if (m ~= n) - error ('A must be square') ; + error ('GrB:error', 'A must be square') ; end d = [ ] ; @@ -47,7 +47,7 @@ end if (check && ~issymmetric (spones (A))) - error ('pattern of A must be symmetric') ; + error ('GrB:error', 'pattern of A must be symmetric') ; end if (isequal (class (d), 'GrB')) diff --git a/GraphBLAS/GraphBLAS/@GrB/unopinfo.m b/GraphBLAS/GraphBLAS/@GrB/unopinfo.m index 15c6c3806..285eb59bf 100644 --- a/GraphBLAS/GraphBLAS/@GrB/unopinfo.m +++ b/GraphBLAS/GraphBLAS/@GrB/unopinfo.m @@ -70,6 +70,7 @@ function unopinfo (op, type) % tgamma z = tgamma (x) gamma function, also 'gamma' % erf z = erf (x) error function % erfc z = erfc (x) complementary error function +% cbrt z = cbrt (x) cube root % frexpx z = frexpx (x) mantissa from ANSI C11 frexp function % frexpe z = frexpe (x) exponent from ANSI C11 frexp function; % the built-in [f,e]=log2(x) returns diff --git a/GraphBLAS/GraphBLAS/CMakeLists.txt b/GraphBLAS/GraphBLAS/CMakeLists.txt index 98eb0cb07..ec604ebd7 100644 --- a/GraphBLAS/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/GraphBLAS/CMakeLists.txt @@ -28,11 +28,13 @@ endif ( ) # MacOS settings set ( CMAKE_MACOSX_RPATH TRUE ) -# version of SuiteSparse:GraphBLAS (must match ../CMakeLists.txt) -set ( GraphBLAS_DATE "Apr 8, 2022" ) +# version must match ../CMakeLists.txt: + +# version of SuiteSparse:GraphBLAS +set ( GraphBLAS_DATE "Aug 8, 2022" ) set ( GraphBLAS_VERSION_MAJOR 7 ) -set ( GraphBLAS_VERSION_MINOR 0 ) -set ( GraphBLAS_VERSION_SUB 3 ) +set ( GraphBLAS_VERSION_MINOR 2 ) +set ( GraphBLAS_VERSION_SUB 0 ) message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} ) @@ -133,7 +135,7 @@ message ( STATUS "CMAKE have OpenMP: " ${OPENMP_FOUND} ) set ( CMAKE_INCLUDE_CURRENT_DIR ON ) -include_directories ( ../Source/Template ../Source ../Include rename ../Source/Generated1 ../lz4 ../rmm_wrap ../Source/Generated2 ) +include_directories ( ../Source/Template ../Source ../Include rename ../Source/Generated1 ../lz4 ../zstd ../zstd/zstd_subset ../rmm_wrap 
../Source/Generated2 ) #------------------------------------------------------------------------------- # compiler options: diff --git a/GraphBLAS/GraphBLAS/README.md b/GraphBLAS/GraphBLAS/README.md index 037f3ccdd..49314f1a6 100644 --- a/GraphBLAS/GraphBLAS/README.md +++ b/GraphBLAS/GraphBLAS/README.md @@ -111,19 +111,16 @@ messages during the test. This is expected. # FUTURE: Not yet supported for GrB matrices in Octave/MATLAB: - linear indexing - 2nd output for [x,i] = max (...) and [x,i] = min (...); needs - modified reduction methods inside GraphBLAS + linear indexing, except for C=A(:) to index the whole matrix A + or C(:)=A to index the whole matrix C. + 2nd output for [x,i] = max (...) and [x,i] = min (...): + use GrB.argmin and GrB.argmax instead. 'includenan' for min and max min and max for complex matrices singleton expansion - 3D and higher dimensional matrices: - this might be done by converting the higher dimensioal - indices down to a large 2D space, and relying on hypersparsity. saturating element-wise binary and unary operators for integers. See also the discussion in the User Guide. These functions are supported, but are not yet as fast as they could be: -bandwidth, eps, isbanded, isdiag, ishermitian, issymmetric, istril, istriu, -spfun. +eps, ishermitian, issymmetric, spfun. diff --git a/GraphBLAS/GraphBLAS/demo/Contents.m b/GraphBLAS/GraphBLAS/demo/Contents.m index 8d5db0a73..192b37111 100644 --- a/GraphBLAS/GraphBLAS/demo/Contents.m +++ b/GraphBLAS/GraphBLAS/demo/Contents.m @@ -1,21 +1,24 @@ % SuiteSparse/GraphBLAS/GraphBLAS/demo: demos for GraphBLAS @GrB interface % -% dnn_builtin - Sparse deep neural network without @GrB -% gbdemo - run the graphblas_demo.m -% gbdemo2 - Extreme performance differences: GraphBLAS vs built-in methods -% graphblas_demo - GraphBLAS: graph algorithms in the language of linear algebra -% graphblas_demo2 - Run the GraphBLAS demo2 -% +% dnn_builtin - Sparse deep neural network without @GrB +% gbdemo - run the graphblas_demo.m +% gbdemo2 - Extreme performance differences: GraphBLAS vs built-in methods +% graphblas_demo - GraphBLAS: graph algorithms in the language of linear algebra +% graphblas_demo2 - Run the GraphBLAS demo2 % dnn_builtin2gb - convert sparse deep neural network from built-in to GraphBLAS -% dnn_run - Run the DNN benchmarks -% mxm_demo - performance test of real and complex A*B -% -% Folders and other files: +% dnn_run - Run the DNN benchmarks +% mxm_demo - performance test of real and complex A*B +% bfs_builtin - breadth-first-search using purely built-in methods +% cats_demo - cats_demo.m +% demo_nproc - determine the default # of cores, or set the # of cores to use +% demo_octave - return true if Octave is in use, false for MATLAB +% demo_whoami - return 'Octave' or 'MATLAB' +% tmask - performance tests for logical indexing % -% dnn_results - DNN performance results -% html - output of graphblas_demo % mxm_demo_DellXPS13.txt - mxm_demo results on Intel Core i7-8565U (4 core) % mxm_demo_DGX_Station.txt - mxm_demo results on Intel Xeon E5-2689 (20 core) +% +% The html folder contains the output of graphblas_demo. % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/demo/dnn_builtin.m b/GraphBLAS/GraphBLAS/demo/dnn_builtin.m index dbf9be944..ec4532994 100644 --- a/GraphBLAS/GraphBLAS/demo/dnn_builtin.m +++ b/GraphBLAS/GraphBLAS/demo/dnn_builtin.m @@ -1,5 +1,5 @@ function Y = dnn_builtin (W, bias, Y0) -%DNN_BUILTIN Sparse deep neural network in without @GrB methods +%DNN_BUILTIN Sparse deep neural network without @GrB % Performs ReLU inference using input feature vector(s) Y0, DNN weights W, % and bias vectors. % @@ -32,3 +32,4 @@ Y (Y > 32) = 32 ; end + diff --git a/GraphBLAS/GraphBLAS/demo/dnn_builtin2gb.m b/GraphBLAS/GraphBLAS/demo/dnn_builtin2gb.m index 6b27d5ae0..1974d549c 100644 --- a/GraphBLAS/GraphBLAS/demo/dnn_builtin2gb.m +++ b/GraphBLAS/GraphBLAS/demo/dnn_builtin2gb.m @@ -1,5 +1,5 @@ function [W, bias, Y0] = dnn_builtin2gb (W, bias, Y0) -%DNN_MAT2GB convert sparse deep neural network from built-in to GraphBLAS +%DNN_BUILTIN2GB convert sparse deep neural network from built-in to GraphBLAS % % Usage: % @@ -37,3 +37,4 @@ bias {k} = GrB.build (1:n, 1:n, bias {k}, n, n, '+', prec, d) ; end + diff --git a/GraphBLAS/GraphBLAS/demo/gbdemo2.m b/GraphBLAS/GraphBLAS/demo/gbdemo2.m index a61662077..bdf952362 100644 --- a/GraphBLAS/GraphBLAS/demo/gbdemo2.m +++ b/GraphBLAS/GraphBLAS/demo/gbdemo2.m @@ -1,5 +1,5 @@ function gbdemo2 (bnz) -%GBDEMO2 Extreme performance differences: GraphBLAS vs built-in methods. +%GBDEMO2 Extreme performance differences: GraphBLAS vs built-in methods % % Usage: % @@ -99,3 +99,4 @@ function gbdemo2 (bnz) end + diff --git a/GraphBLAS/GraphBLAS/demo/graphblas_demo.m b/GraphBLAS/GraphBLAS/demo/graphblas_demo.m index c9cc37301..d933fb4e7 100644 --- a/GraphBLAS/GraphBLAS/demo/graphblas_demo.m +++ b/GraphBLAS/GraphBLAS/demo/graphblas_demo.m @@ -827,25 +827,8 @@ % the equivalent built-in operators and functions in MATLAB. % % There are few notable exceptions; these will be addressed in the future. -% These include bandwidth, istriu, istril, isdiag, reshape, issymmetric, -% and ishermitian, all of which should be faster in a future release. - -%% -% Here is an example that illustrates the performance of istril. -A = sparse (rand (2000)) ; -tic -c1 = istril (A) ; -builtin_time = toc ; -A = GrB (A) ; -tic -c2 = istril (A) ; -gb_time = toc ; -fprintf ('\n%s: %g sec, GraphBLAS: %g sec\n', ... - demo_whoami, builtin_time, gb_time) ; -if (gb_time > builtin_time) - fprintf ('GraphBLAS is slower by a factor of %g\n', ... - gb_time / builtin_time) ; -end +% These include reshape, issymmetric, and ishermitian, all of which should +% be faster in a future release. %% % (4) Linear indexing: @@ -853,15 +836,15 @@ % If A is an m-by-n 2D MATLAB matrix, with n > 1, A(:) is a column vector % of length m*n. The index operation A(i) accesses the ith entry in the % vector A(:). This is called linear indexing in MATLAB. It is not yet -% available for GraphBLAS matrices in this MATLAB interface to GraphBLAS, -% but will be added in the future. +% fully available for GraphBLAS matrices in this MATLAB interface to +% GraphBLAS, but will be added in the future. %% % (5) Implicit singleton dimension expansion % % In MATLAB C=A+B where A is m-by-n and B is a 1-by-n row vector % implicitly expands B to a matrix, computing C(i,j)=A(i,j)+B(j). This -% implicit expansion is not yet suported in GraphBLAS with C=A+B. +% implicit expansion is not yet supported in GraphBLAS with C=A+B. % However, it can be done with C = GrB.mxm ('+.+', A, diag(GrB(B))). 
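% For example (a small sketch; the 2-by-3 values here are illustrative only,
% not part of the demo's data):
%
%   A = GrB ([1 2 3 ; 4 5 6]) ;              % 2-by-3 matrix
%   B = [10 20 30] ;                         % 1-by-3 row vector
%   C = GrB.mxm ('+.+', A, diag (GrB (B))) ; % C(i,j) = A(i,j) + B(j)
%   % C is then [11 22 33 ; 14 25 36]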
% That's a nice example of the power of semirings, but it's not % immediately obvious, and not as clear a syntax as C=A+B. The diff --git a/GraphBLAS/GraphBLAS/demo/tmask.m b/GraphBLAS/GraphBLAS/demo/tmask.m index 0d5c552a5..371a430da 100644 --- a/GraphBLAS/GraphBLAS/demo/tmask.m +++ b/GraphBLAS/GraphBLAS/demo/tmask.m @@ -1,4 +1,4 @@ -% tmask.m +% TMASK performance tests for logical indexing % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/rename/GB_rename.h b/GraphBLAS/GraphBLAS/rename/GB_rename.h index 0230c8506..6c6afb791 100644 --- a/GraphBLAS/GraphBLAS/rename/GB_rename.h +++ b/GraphBLAS/GraphBLAS/rename/GB_rename.h @@ -1447,6 +1447,8 @@ #define GxB_BXOR_UINT8_MONOID GxM_BXOR_UINT8_MONOID #define GxB_CARG_FC32 GxM_CARG_FC32 #define GxB_CARG_FC64 GxM_CARG_FC64 +#define GxB_CBRT_FP32 GxM_CBRT_FP32 +#define GxB_CBRT_FP64 GxM_CBRT_FP64 #define GxB_CEIL_FC32 GxM_CEIL_FC32 #define GxB_CEIL_FC64 GxM_CEIL_FC64 #define GxB_CEIL_FP32 GxM_CEIL_FP32 @@ -2024,6 +2026,7 @@ #define GxB_Matrix_import_HyperCSC GxM_Matrix_import_HyperCSC #define GxB_Matrix_import_HyperCSR GxM_Matrix_import_HyperCSR #define GxB_Matrix_iso GxM_Matrix_iso +#define GxB_Matrix_isStoredElement GxM_Matrix_isStoredElement #define GxB_Matrix_memoryUsage GxM_Matrix_memoryUsage #define GxB_Matrix_Option_get GxM_Matrix_Option_get #define GxB_Matrix_Option_set GxM_Matrix_Option_set @@ -2037,6 +2040,8 @@ #define GxB_Matrix_pack_HyperCSR GxM_Matrix_pack_HyperCSR #define GxB_Matrix_reduce_FC32 GxM_Matrix_reduce_FC32 #define GxB_Matrix_reduce_FC64 GxM_Matrix_reduce_FC64 +#define GxB_Matrix_reshape GxM_Matrix_reshape +#define GxB_Matrix_reshapeDup GxM_Matrix_reshapeDup #define GxB_Matrix_resize GxM_Matrix_resize #define GxB_Matrix_select GxM_Matrix_select #define GxB_Matrix_select_FC32 GxM_Matrix_select_FC32 @@ -3265,6 +3270,7 @@ #define GxB_Vector_import_CSC GxM_Vector_import_CSC #define GxB_Vector_import_Full GxM_Vector_import_Full #define GxB_Vector_iso GxM_Vector_iso +#define GxB_Vector_isStoredElement GxM_Vector_isStoredElement #define GxB_Vector_memoryUsage GxM_Vector_memoryUsage #define GxB_Vector_Option_get GxM_Vector_Option_get #define GxB_Vector_Option_set GxM_Vector_Option_set @@ -3551,6 +3557,7 @@ #define GB_emult_08_phase2 GM_emult_08_phase2 #define GB_emult_sparsity GM_emult_sparsity #define GB_entry_check GM_entry_check +#define GB_ERR_getErrorString GM_ERR_getErrorString #define GB_eslice GM_eslice #define GB_ewise GM_ewise #define GB_ewise_generic GM_ewise_generic @@ -3569,6 +3576,35 @@ #define GB_frexpef GM_frexpef #define GB_frexpx GM_frexpx #define GB_frexpxf GM_frexpxf +#define GB_FSE_buildCTable_raw GM_FSE_buildCTable_raw +#define GB_FSE_buildCTable_rle GM_FSE_buildCTable_rle +#define GB_FSE_buildCTable_wksp GM_FSE_buildCTable_wksp +#define GB_FSE_buildDTable GM_FSE_buildDTable +#define GB_FSE_buildDTable_raw GM_FSE_buildDTable_raw +#define GB_FSE_buildDTable_rle GM_FSE_buildDTable_rle +#define GB_FSE_buildDTable_wksp GM_FSE_buildDTable_wksp +#define GB_FSE_compress GM_FSE_compress +#define GB_FSE_compress2 GM_FSE_compress2 +#define GB_FSE_compressBound GM_FSE_compressBound +#define GB_FSE_compress_usingCTable GM_FSE_compress_usingCTable +#define GB_FSE_compress_wksp GM_FSE_compress_wksp +#define GB_FSE_createCTable GM_FSE_createCTable +#define GB_FSE_createDTable GM_FSE_createDTable +#define GB_FSE_decompress GM_FSE_decompress +#define GB_FSE_decompress_usingDTable GM_FSE_decompress_usingDTable +#define 
GB_FSE_decompress_wksp GM_FSE_decompress_wksp +#define GB_FSE_decompress_wksp_bmi2 GM_FSE_decompress_wksp_bmi2 +#define GB_FSE_freeCTable GM_FSE_freeCTable +#define GB_FSE_freeDTable GM_FSE_freeDTable +#define GB_FSE_getErrorName GM_FSE_getErrorName +#define GB_FSE_NCountWriteBound GM_FSE_NCountWriteBound +#define GB_FSE_normalizeCount GM_FSE_normalizeCount +#define GB_FSE_optimalTableLog GM_FSE_optimalTableLog +#define GB_FSE_optimalTableLog_internal GM_FSE_optimalTableLog_internal +#define GB_FSE_readNCount GM_FSE_readNCount +#define GB_FSE_readNCount_bmi2 GM_FSE_readNCount_bmi2 +#define GB_FSE_versionNumber GM_FSE_versionNumber +#define GB_FSE_writeNCount GM_FSE_writeNCount #define GB_Global GM_Global #define GB_Global_abort_function GM_Global_abort_function #define GB_Global_abort_function_set GM_Global_abort_function_set @@ -3664,6 +3700,71 @@ #define GB_helper5 GM_helper5 #define GB_helper7 GM_helper7 #define GB_helper8 GM_helper8 +#define GB_HIST_count GM_HIST_count +#define GB_HIST_countFast GM_HIST_countFast +#define GB_HIST_countFast_wksp GM_HIST_countFast_wksp +#define GB_HIST_count_simple GM_HIST_count_simple +#define GB_HIST_count_wksp GM_HIST_count_wksp +#define GB_HIST_isError GM_HIST_isError +#define GB_HUF_buildCTable GM_HUF_buildCTable +#define GB_HUF_buildCTable_wksp GM_HUF_buildCTable_wksp +#define GB_HUF_compress GM_HUF_compress +#define GB_HUF_compress1X GM_HUF_compress1X +#define GB_HUF_compress1X_repeat GM_HUF_compress1X_repeat +#define GB_HUF_compress1X_usingCTable GM_HUF_compress1X_usingCTable +#define GB_HUF_compress1X_usingCTable_bmi2 GM_HUF_compress1X_usingCTable_bmi2 +#define GB_HUF_compress1X_wksp GM_HUF_compress1X_wksp +#define GB_HUF_compress2 GM_HUF_compress2 +#define GB_HUF_compress4X_repeat GM_HUF_compress4X_repeat +#define GB_HUF_compress4X_usingCTable GM_HUF_compress4X_usingCTable +#define GB_HUF_compress4X_usingCTable_bmi2 GM_HUF_compress4X_usingCTable_bmi2 +#define GB_HUF_compress4X_wksp GM_HUF_compress4X_wksp +#define GB_HUF_compressBound GM_HUF_compressBound +#define GB_HUF_decompress GM_HUF_decompress +#define GB_HUF_decompress1X1 GM_HUF_decompress1X1 +#define GB_HUF_decompress1X1_DCtx GM_HUF_decompress1X1_DCtx +#define GB_HUF_decompress1X1_DCtx_wksp GM_HUF_decompress1X1_DCtx_wksp +#define GB_HUF_decompress1X1_DCtx_wksp_bmi2 GM_HUF_decompress1X1_DCtx_wksp_bmi2 +#define GB_HUF_decompress1X1_usingDTable GM_HUF_decompress1X1_usingDTable +#define GB_HUF_decompress1X2 GM_HUF_decompress1X2 +#define GB_HUF_decompress1X2_DCtx GM_HUF_decompress1X2_DCtx +#define GB_HUF_decompress1X2_DCtx_wksp GM_HUF_decompress1X2_DCtx_wksp +#define GB_HUF_decompress1X2_usingDTable GM_HUF_decompress1X2_usingDTable +#define GB_HUF_decompress1X_DCtx GM_HUF_decompress1X_DCtx +#define GB_HUF_decompress1X_DCtx_wksp GM_HUF_decompress1X_DCtx_wksp +#define GB_HUF_decompress1X_usingDTable GM_HUF_decompress1X_usingDTable +#define GB_HUF_decompress1X_usingDTable_bmi2 GM_HUF_decompress1X_usingDTable_bmi2 +#define GB_HUF_decompress4X1 GM_HUF_decompress4X1 +#define GB_HUF_decompress4X1_DCtx GM_HUF_decompress4X1_DCtx +#define GB_HUF_decompress4X1_DCtx_wksp GM_HUF_decompress4X1_DCtx_wksp +#define GB_HUF_decompress4X1_usingDTable GM_HUF_decompress4X1_usingDTable +#define GB_HUF_decompress4X2 GM_HUF_decompress4X2 +#define GB_HUF_decompress4X2_DCtx GM_HUF_decompress4X2_DCtx +#define GB_HUF_decompress4X2_DCtx_wksp GM_HUF_decompress4X2_DCtx_wksp +#define GB_HUF_decompress4X2_usingDTable GM_HUF_decompress4X2_usingDTable +#define GB_HUF_decompress4X_DCtx GM_HUF_decompress4X_DCtx +#define 
GB_HUF_decompress4X_hufOnly GM_HUF_decompress4X_hufOnly +#define GB_HUF_decompress4X_hufOnly_wksp GM_HUF_decompress4X_hufOnly_wksp +#define GB_HUF_decompress4X_hufOnly_wksp_bmi2 GM_HUF_decompress4X_hufOnly_wksp_bmi2 +#define GB_HUF_decompress4X_usingDTable GM_HUF_decompress4X_usingDTable +#define GB_HUF_decompress4X_usingDTable_bmi2 GM_HUF_decompress4X_usingDTable_bmi2 +#define GB_HUF_estimateCompressedSize GM_HUF_estimateCompressedSize +#define GB_HUF_getErrorName GM_HUF_getErrorName +#define GB_HUF_getNbBitsFromCTable GM_HUF_getNbBitsFromCTable +#define GB_HUF_optimalTableLog GM_HUF_optimalTableLog +#define GB_HUF_readCTable GM_HUF_readCTable +#define GB_HUF_readDTableX1 GM_HUF_readDTableX1 +#define GB_HUF_readDTableX1_wksp GM_HUF_readDTableX1_wksp +#define GB_HUF_readDTableX1_wksp_bmi2 GM_HUF_readDTableX1_wksp_bmi2 +#define GB_HUF_readDTableX2 GM_HUF_readDTableX2 +#define GB_HUF_readDTableX2_wksp GM_HUF_readDTableX2_wksp +#define GB_HUF_readDTableX2_wksp_bmi2 GM_HUF_readDTableX2_wksp_bmi2 +#define GB_HUF_readStats GM_HUF_readStats +#define GB_HUF_readStats_wksp GM_HUF_readStats_wksp +#define GB_HUF_selectDecoder GM_HUF_selectDecoder +#define GB_HUF_validateCTable GM_HUF_validateCTable +#define GB_HUF_writeCTable GM_HUF_writeCTable +#define GB_HUF_writeCTable_wksp GM_HUF_writeCTable_wksp #define GB_hypermatrix_prune GM_hypermatrix_prune #define GB_hyper_prune GM_hyper_prune #define GB_hyper_realloc GM_hyper_realloc @@ -3711,8 +3812,8 @@ #define GB_LZ4_compress_fast_extState GM_LZ4_compress_fast_extState #define GB_LZ4_compress_fast_extState_fastReset GM_LZ4_compress_fast_extState_fastReset #define GB_LZ4_compress_forceExtDict GM_LZ4_compress_forceExtDict -#define GB_LZ4_compressHC GM_LZ4_compressHC #define GB_LZ4_compress_HC GM_LZ4_compress_HC +#define GB_LZ4_compressHC GM_LZ4_compressHC #define GB_LZ4_compressHC2 GM_LZ4_compressHC2 #define GB_LZ4_compressHC2_continue GM_LZ4_compressHC2_continue #define GB_LZ4_compressHC2_limitedOutput GM_LZ4_compressHC2_limitedOutput @@ -3791,6 +3892,7 @@ #define GB_Matrix_check GM_Matrix_check #define GB_Matrix_diag GM_Matrix_diag #define GB_Matrix_free GM_Matrix_free +#define GB_Matrix_new GM_Matrix_new #define GB_Matrix_removeElement GM_Matrix_removeElement #define GB_matvec_check GM_matvec_check #define GB_matvec_type GM_matvec_type @@ -3825,6 +3927,14 @@ #define GB_Pending_realloc GM_Pending_realloc #define GB_phbix_free GM_phbix_free #define GB_ph_free GM_ph_free +#define GB_POOL_add GM_POOL_add +#define GB_POOL_create GM_POOL_create +#define GB_POOL_create_advanced GM_POOL_create_advanced +#define GB_POOL_free GM_POOL_free +#define GB_POOL_joinJobs GM_POOL_joinJobs +#define GB_POOL_resize GM_POOL_resize +#define GB_POOL_sizeof GM_POOL_sizeof +#define GB_POOL_tryAdd GM_POOL_tryAdd #define GB_positional_binop_ijflip GM_positional_binop_ijflip #define GB_positional_idxunop_ijflip GM_positional_idxunop_ijflip #define GB_positional_offset GM_positional_offset @@ -3852,6 +3962,7 @@ #define GB_realloc_memory GM_realloc_memory #define GB_reduce_to_scalar GM_reduce_to_scalar #define GB_reduce_to_vector GM_reduce_to_vector +#define GB_reshape GM_reshape #define GB_resize GM_resize #define GB_Scalar_check GM_Scalar_check #define GB_Scalar_reduce GM_Scalar_reduce @@ -3937,3 +4048,291 @@ #define GB_Vector_removeElement GM_Vector_removeElement #define GB_wait GM_wait #define GB_xalloc_memory GM_xalloc_memory +#define GB_ZSTD_adjustCParams GM_ZSTD_adjustCParams +#define GB_ZSTD_buildBlockEntropyStats GM_ZSTD_buildBlockEntropyStats +#define GB_ZSTD_buildCTable 
GM_ZSTD_buildCTable +#define GB_ZSTD_buildFSETable GM_ZSTD_buildFSETable +#define GB_ZSTD_calloc GM_ZSTD_calloc +#define GB_ZSTD_CCtx_getParameter GM_ZSTD_CCtx_getParameter +#define GB_ZSTD_CCtx_loadDictionary GM_ZSTD_CCtx_loadDictionary +#define GB_ZSTD_CCtx_loadDictionary_advanced GM_ZSTD_CCtx_loadDictionary_advanced +#define GB_ZSTD_CCtx_loadDictionary_byReference GM_ZSTD_CCtx_loadDictionary_byReference +#define GB_ZSTD_CCtxParams_getParameter GM_ZSTD_CCtxParams_getParameter +#define GB_ZSTD_CCtxParams_init GM_ZSTD_CCtxParams_init +#define GB_ZSTD_CCtxParams_init_advanced GM_ZSTD_CCtxParams_init_advanced +#define GB_ZSTD_CCtxParams_reset GM_ZSTD_CCtxParams_reset +#define GB_ZSTD_CCtxParams_setParameter GM_ZSTD_CCtxParams_setParameter +#define GB_ZSTD_CCtx_refCDict GM_ZSTD_CCtx_refCDict +#define GB_ZSTD_CCtx_refPrefix GM_ZSTD_CCtx_refPrefix +#define GB_ZSTD_CCtx_refPrefix_advanced GM_ZSTD_CCtx_refPrefix_advanced +#define GB_ZSTD_CCtx_refThreadPool GM_ZSTD_CCtx_refThreadPool +#define GB_ZSTD_CCtx_reset GM_ZSTD_CCtx_reset +#define GB_ZSTD_CCtx_setParameter GM_ZSTD_CCtx_setParameter +#define GB_ZSTD_CCtx_setParametersUsingCCtxParams GM_ZSTD_CCtx_setParametersUsingCCtxParams +#define GB_ZSTD_CCtx_setPledgedSrcSize GM_ZSTD_CCtx_setPledgedSrcSize +#define GB_ZSTD_CCtx_trace GM_ZSTD_CCtx_trace +#define GB_ZSTD_checkContinuity GM_ZSTD_checkContinuity +#define GB_ZSTD_checkCParams GM_ZSTD_checkCParams +#define GB_ZSTD_compress GM_ZSTD_compress +#define GB_ZSTD_compress2 GM_ZSTD_compress2 +#define GB_ZSTD_compress_advanced GM_ZSTD_compress_advanced +#define GB_ZSTD_compress_advanced_internal GM_ZSTD_compress_advanced_internal +#define GB_ZSTD_compressBegin GM_ZSTD_compressBegin +#define GB_ZSTD_compressBegin_advanced GM_ZSTD_compressBegin_advanced +#define GB_ZSTD_compressBegin_advanced_internal GM_ZSTD_compressBegin_advanced_internal +#define GB_ZSTD_compressBegin_usingCDict GM_ZSTD_compressBegin_usingCDict +#define GB_ZSTD_compressBegin_usingCDict_advanced GM_ZSTD_compressBegin_usingCDict_advanced +#define GB_ZSTD_compressBegin_usingDict GM_ZSTD_compressBegin_usingDict +#define GB_ZSTD_compressBlock GM_ZSTD_compressBlock +#define GB_ZSTD_compressBlock_btlazy2 GM_ZSTD_compressBlock_btlazy2 +#define GB_ZSTD_compressBlock_btlazy2_dictMatchState GM_ZSTD_compressBlock_btlazy2_dictMatchState +#define GB_ZSTD_compressBlock_btlazy2_extDict GM_ZSTD_compressBlock_btlazy2_extDict +#define GB_ZSTD_compressBlock_btopt GM_ZSTD_compressBlock_btopt +#define GB_ZSTD_compressBlock_btopt_dictMatchState GM_ZSTD_compressBlock_btopt_dictMatchState +#define GB_ZSTD_compressBlock_btopt_extDict GM_ZSTD_compressBlock_btopt_extDict +#define GB_ZSTD_compressBlock_btultra GM_ZSTD_compressBlock_btultra +#define GB_ZSTD_compressBlock_btultra2 GM_ZSTD_compressBlock_btultra2 +#define GB_ZSTD_compressBlock_btultra_dictMatchState GM_ZSTD_compressBlock_btultra_dictMatchState +#define GB_ZSTD_compressBlock_btultra_extDict GM_ZSTD_compressBlock_btultra_extDict +#define GB_ZSTD_compressBlock_doubleFast GM_ZSTD_compressBlock_doubleFast +#define GB_ZSTD_compressBlock_doubleFast_dictMatchState GM_ZSTD_compressBlock_doubleFast_dictMatchState +#define GB_ZSTD_compressBlock_doubleFast_extDict GM_ZSTD_compressBlock_doubleFast_extDict +#define GB_ZSTD_compressBlock_fast GM_ZSTD_compressBlock_fast +#define GB_ZSTD_compressBlock_fast_dictMatchState GM_ZSTD_compressBlock_fast_dictMatchState +#define GB_ZSTD_compressBlock_fast_extDict GM_ZSTD_compressBlock_fast_extDict +#define GB_ZSTD_compressBlock_greedy GM_ZSTD_compressBlock_greedy +#define 
GB_ZSTD_compressBlock_greedy_dedicatedDictSearch GM_ZSTD_compressBlock_greedy_dedicatedDictSearch +#define GB_ZSTD_compressBlock_greedy_dedicatedDictSearch_row GM_ZSTD_compressBlock_greedy_dedicatedDictSearch_row +#define GB_ZSTD_compressBlock_greedy_dictMatchState GM_ZSTD_compressBlock_greedy_dictMatchState +#define GB_ZSTD_compressBlock_greedy_dictMatchState_row GM_ZSTD_compressBlock_greedy_dictMatchState_row +#define GB_ZSTD_compressBlock_greedy_extDict GM_ZSTD_compressBlock_greedy_extDict +#define GB_ZSTD_compressBlock_greedy_extDict_row GM_ZSTD_compressBlock_greedy_extDict_row +#define GB_ZSTD_compressBlock_greedy_row GM_ZSTD_compressBlock_greedy_row +#define GB_ZSTD_compressBlock_lazy GM_ZSTD_compressBlock_lazy +#define GB_ZSTD_compressBlock_lazy2 GM_ZSTD_compressBlock_lazy2 +#define GB_ZSTD_compressBlock_lazy2_dedicatedDictSearch GM_ZSTD_compressBlock_lazy2_dedicatedDictSearch +#define GB_ZSTD_compressBlock_lazy2_dedicatedDictSearch_row GM_ZSTD_compressBlock_lazy2_dedicatedDictSearch_row +#define GB_ZSTD_compressBlock_lazy2_dictMatchState GM_ZSTD_compressBlock_lazy2_dictMatchState +#define GB_ZSTD_compressBlock_lazy2_dictMatchState_row GM_ZSTD_compressBlock_lazy2_dictMatchState_row +#define GB_ZSTD_compressBlock_lazy2_extDict GM_ZSTD_compressBlock_lazy2_extDict +#define GB_ZSTD_compressBlock_lazy2_extDict_row GM_ZSTD_compressBlock_lazy2_extDict_row +#define GB_ZSTD_compressBlock_lazy2_row GM_ZSTD_compressBlock_lazy2_row +#define GB_ZSTD_compressBlock_lazy_dedicatedDictSearch GM_ZSTD_compressBlock_lazy_dedicatedDictSearch +#define GB_ZSTD_compressBlock_lazy_dedicatedDictSearch_row GM_ZSTD_compressBlock_lazy_dedicatedDictSearch_row +#define GB_ZSTD_compressBlock_lazy_dictMatchState GM_ZSTD_compressBlock_lazy_dictMatchState +#define GB_ZSTD_compressBlock_lazy_dictMatchState_row GM_ZSTD_compressBlock_lazy_dictMatchState_row +#define GB_ZSTD_compressBlock_lazy_extDict GM_ZSTD_compressBlock_lazy_extDict +#define GB_ZSTD_compressBlock_lazy_extDict_row GM_ZSTD_compressBlock_lazy_extDict_row +#define GB_ZSTD_compressBlock_lazy_row GM_ZSTD_compressBlock_lazy_row +#define GB_ZSTD_compressBound GM_ZSTD_compressBound +#define GB_ZSTD_compressCCtx GM_ZSTD_compressCCtx +#define GB_ZSTD_compressContinue GM_ZSTD_compressContinue +#define GB_ZSTD_compressEnd GM_ZSTD_compressEnd +#define GB_ZSTD_compressLiterals GM_ZSTD_compressLiterals +#define GB_ZSTD_compressRleLiteralsBlock GM_ZSTD_compressRleLiteralsBlock +#define GB_ZSTD_compressSequences GM_ZSTD_compressSequences +#define GB_ZSTD_compressStream GM_ZSTD_compressStream +#define GB_ZSTD_compressStream2 GM_ZSTD_compressStream2 +#define GB_ZSTD_compressStream2_simpleArgs GM_ZSTD_compressStream2_simpleArgs +#define GB_ZSTD_compressSuperBlock GM_ZSTD_compressSuperBlock +#define GB_ZSTD_compress_usingCDict GM_ZSTD_compress_usingCDict +#define GB_ZSTD_compress_usingCDict_advanced GM_ZSTD_compress_usingCDict_advanced +#define GB_ZSTD_compress_usingDict GM_ZSTD_compress_usingDict +#define GB_ZSTD_copyCCtx GM_ZSTD_copyCCtx +#define GB_ZSTD_copyDCtx GM_ZSTD_copyDCtx +#define GB_ZSTD_copyDDictParameters GM_ZSTD_copyDDictParameters +#define GB_ZSTD_cParam_getBounds GM_ZSTD_cParam_getBounds +#define GB_ZSTD_createCCtx GM_ZSTD_createCCtx +#define GB_ZSTD_createCCtx_advanced GM_ZSTD_createCCtx_advanced +#define GB_ZSTD_createCCtxParams GM_ZSTD_createCCtxParams +#define GB_ZSTD_createCDict GM_ZSTD_createCDict +#define GB_ZSTD_createCDict_advanced GM_ZSTD_createCDict_advanced +#define GB_ZSTD_createCDict_advanced2 GM_ZSTD_createCDict_advanced2 +#define 
GB_ZSTD_createCDict_byReference GM_ZSTD_createCDict_byReference +#define GB_ZSTD_createCStream GM_ZSTD_createCStream +#define GB_ZSTD_createCStream_advanced GM_ZSTD_createCStream_advanced +#define GB_ZSTD_createDCtx GM_ZSTD_createDCtx +#define GB_ZSTD_createDCtx_advanced GM_ZSTD_createDCtx_advanced +#define GB_ZSTD_createDDict GM_ZSTD_createDDict +#define GB_ZSTD_createDDict_advanced GM_ZSTD_createDDict_advanced +#define GB_ZSTD_createDDict_byReference GM_ZSTD_createDDict_byReference +#define GB_ZSTD_createDStream GM_ZSTD_createDStream +#define GB_ZSTD_createDStream_advanced GM_ZSTD_createDStream_advanced +#define GB_ZSTD_crossEntropyCost GM_ZSTD_crossEntropyCost +#define GB_ZSTD_CStreamInSize GM_ZSTD_CStreamInSize +#define GB_ZSTD_CStreamOutSize GM_ZSTD_CStreamOutSize +#define GB_ZSTD_customCalloc GM_ZSTD_customCalloc +#define GB_ZSTD_customFree GM_ZSTD_customFree +#define GB_ZSTD_customMalloc GM_ZSTD_customMalloc +#define GB_ZSTD_cycleLog GM_ZSTD_cycleLog +#define GB_ZSTD_DCtx_getParameter GM_ZSTD_DCtx_getParameter +#define GB_ZSTD_DCtx_loadDictionary GM_ZSTD_DCtx_loadDictionary +#define GB_ZSTD_DCtx_loadDictionary_advanced GM_ZSTD_DCtx_loadDictionary_advanced +#define GB_ZSTD_DCtx_loadDictionary_byReference GM_ZSTD_DCtx_loadDictionary_byReference +#define GB_ZSTD_DCtx_refDDict GM_ZSTD_DCtx_refDDict +#define GB_ZSTD_DCtx_refPrefix GM_ZSTD_DCtx_refPrefix +#define GB_ZSTD_DCtx_refPrefix_advanced GM_ZSTD_DCtx_refPrefix_advanced +#define GB_ZSTD_DCtx_reset GM_ZSTD_DCtx_reset +#define GB_ZSTD_DCtx_setFormat GM_ZSTD_DCtx_setFormat +#define GB_ZSTD_DCtx_setMaxWindowSize GM_ZSTD_DCtx_setMaxWindowSize +#define GB_ZSTD_DCtx_setParameter GM_ZSTD_DCtx_setParameter +#define GB_ZSTD_DDict_dictContent GM_ZSTD_DDict_dictContent +#define GB_ZSTD_DDict_dictSize GM_ZSTD_DDict_dictSize +#define GB_ZSTD_decodeLiteralsBlock GM_ZSTD_decodeLiteralsBlock +#define GB_ZSTD_decodeSeqHeaders GM_ZSTD_decodeSeqHeaders +#define GB_ZSTD_decodingBufferSize_min GM_ZSTD_decodingBufferSize_min +#define GB_ZSTD_decompress GM_ZSTD_decompress +#define GB_ZSTD_decompressBegin GM_ZSTD_decompressBegin +#define GB_ZSTD_decompressBegin_usingDDict GM_ZSTD_decompressBegin_usingDDict +#define GB_ZSTD_decompressBegin_usingDict GM_ZSTD_decompressBegin_usingDict +#define GB_ZSTD_decompressBlock GM_ZSTD_decompressBlock +#define GB_ZSTD_decompressBlock_internal GM_ZSTD_decompressBlock_internal +#define GB_ZSTD_decompressBound GM_ZSTD_decompressBound +#define GB_ZSTD_decompressContinue GM_ZSTD_decompressContinue +#define GB_ZSTD_decompressDCtx GM_ZSTD_decompressDCtx +#define GB_ZSTD_decompressStream GM_ZSTD_decompressStream +#define GB_ZSTD_decompressStream_simpleArgs GM_ZSTD_decompressStream_simpleArgs +#define GB_ZSTD_decompress_usingDDict GM_ZSTD_decompress_usingDDict +#define GB_ZSTD_decompress_usingDict GM_ZSTD_decompress_usingDict +#define GB_ZSTD_dedicatedDictSearch_lazy_loadDictionary GM_ZSTD_dedicatedDictSearch_lazy_loadDictionary +#define GB_ZSTD_defaultCLevel GM_ZSTD_defaultCLevel +#define GB_ZSTD_dParam_getBounds GM_ZSTD_dParam_getBounds +#define GB_ZSTD_DStreamInSize GM_ZSTD_DStreamInSize +#define GB_ZSTD_DStreamOutSize GM_ZSTD_DStreamOutSize +#define GB_ZSTD_encodeSequences GM_ZSTD_encodeSequences +#define GB_ZSTD_endStream GM_ZSTD_endStream +#define GB_ZSTD_estimateCCtxSize GM_ZSTD_estimateCCtxSize +#define GB_ZSTD_estimateCCtxSize_usingCCtxParams GM_ZSTD_estimateCCtxSize_usingCCtxParams +#define GB_ZSTD_estimateCCtxSize_usingCParams GM_ZSTD_estimateCCtxSize_usingCParams +#define GB_ZSTD_estimateCDictSize 
GM_ZSTD_estimateCDictSize +#define GB_ZSTD_estimateCDictSize_advanced GM_ZSTD_estimateCDictSize_advanced +#define GB_ZSTD_estimateCStreamSize GM_ZSTD_estimateCStreamSize +#define GB_ZSTD_estimateCStreamSize_usingCCtxParams GM_ZSTD_estimateCStreamSize_usingCCtxParams +#define GB_ZSTD_estimateCStreamSize_usingCParams GM_ZSTD_estimateCStreamSize_usingCParams +#define GB_ZSTD_estimateDCtxSize GM_ZSTD_estimateDCtxSize +#define GB_ZSTD_estimateDDictSize GM_ZSTD_estimateDDictSize +#define GB_ZSTD_estimateDStreamSize GM_ZSTD_estimateDStreamSize +#define GB_ZSTD_estimateDStreamSize_fromFrame GM_ZSTD_estimateDStreamSize_fromFrame +#define GB_ZSTD_fillDoubleHashTable GM_ZSTD_fillDoubleHashTable +#define GB_ZSTD_fillHashTable GM_ZSTD_fillHashTable +#define GB_ZSTD_findDecompressedSize GM_ZSTD_findDecompressedSize +#define GB_ZSTD_findFrameCompressedSize GM_ZSTD_findFrameCompressedSize +#define GB_ZSTD_flushStream GM_ZSTD_flushStream +#define GB_ZSTD_frameHeaderSize GM_ZSTD_frameHeaderSize +#define GB_ZSTD_free GM_ZSTD_free +#define GB_ZSTD_freeCCtx GM_ZSTD_freeCCtx +#define GB_ZSTD_freeCCtxParams GM_ZSTD_freeCCtxParams +#define GB_ZSTD_freeCDict GM_ZSTD_freeCDict +#define GB_ZSTD_freeCStream GM_ZSTD_freeCStream +#define GB_ZSTD_freeDCtx GM_ZSTD_freeDCtx +#define GB_ZSTD_freeDDict GM_ZSTD_freeDDict +#define GB_ZSTD_freeDStream GM_ZSTD_freeDStream +#define GB_ZSTD_fseBitCost GM_ZSTD_fseBitCost +#define GB_ZSTD_generateSequences GM_ZSTD_generateSequences +#define GB_ZSTD_getBlockSize GM_ZSTD_getBlockSize +#define GB_ZSTD_getcBlockSize GM_ZSTD_getcBlockSize +#define GB_ZSTD_getCParams GM_ZSTD_getCParams +#define GB_ZSTD_getCParamsFromCCtxParams GM_ZSTD_getCParamsFromCCtxParams +#define GB_ZSTD_getCParamsFromCDict GM_ZSTD_getCParamsFromCDict +#define GB_ZSTD_getDecompressedSize GM_ZSTD_getDecompressedSize +#define GB_ZSTD_getDictID_fromCDict GM_ZSTD_getDictID_fromCDict +#define GB_ZSTD_getDictID_fromDDict GM_ZSTD_getDictID_fromDDict +#define GB_ZSTD_getDictID_fromDict GM_ZSTD_getDictID_fromDict +#define GB_ZSTD_getDictID_fromFrame GM_ZSTD_getDictID_fromFrame +#define GB_ZSTD_getErrorCode GM_ZSTD_getErrorCode +#define GB_ZSTD_getErrorName GM_ZSTD_getErrorName +#define GB_ZSTD_getErrorString GM_ZSTD_getErrorString +#define GB_ZSTD_getFrameContentSize GM_ZSTD_getFrameContentSize +#define GB_ZSTD_getFrameHeader GM_ZSTD_getFrameHeader +#define GB_ZSTD_getFrameHeader_advanced GM_ZSTD_getFrameHeader_advanced +#define GB_ZSTD_getFrameProgression GM_ZSTD_getFrameProgression +#define GB_ZSTD_getParams GM_ZSTD_getParams +#define GB_ZSTD_getSeqStore GM_ZSTD_getSeqStore +#define GB_ZSTD_initCStream GM_ZSTD_initCStream +#define GB_ZSTD_initCStream_advanced GM_ZSTD_initCStream_advanced +#define GB_ZSTD_initCStream_internal GM_ZSTD_initCStream_internal +#define GB_ZSTD_initCStream_srcSize GM_ZSTD_initCStream_srcSize +#define GB_ZSTD_initCStream_usingCDict GM_ZSTD_initCStream_usingCDict +#define GB_ZSTD_initCStream_usingCDict_advanced GM_ZSTD_initCStream_usingCDict_advanced +#define GB_ZSTD_initCStream_usingDict GM_ZSTD_initCStream_usingDict +#define GB_ZSTD_initDStream GM_ZSTD_initDStream +#define GB_ZSTD_initDStream_usingDDict GM_ZSTD_initDStream_usingDDict +#define GB_ZSTD_initDStream_usingDict GM_ZSTD_initDStream_usingDict +#define GB_ZSTD_initStaticCCtx GM_ZSTD_initStaticCCtx +#define GB_ZSTD_initStaticCDict GM_ZSTD_initStaticCDict +#define GB_ZSTD_initStaticCStream GM_ZSTD_initStaticCStream +#define GB_ZSTD_initStaticDCtx GM_ZSTD_initStaticDCtx +#define GB_ZSTD_initStaticDDict GM_ZSTD_initStaticDDict +#define 
GB_ZSTD_initStaticDStream GM_ZSTD_initStaticDStream +#define GB_ZSTD_insertAndFindFirstIndex GM_ZSTD_insertAndFindFirstIndex +#define GB_ZSTD_insertBlock GM_ZSTD_insertBlock +#define GB_ZSTD_invalidateRepCodes GM_ZSTD_invalidateRepCodes +#define GB_ZSTD_isFrame GM_ZSTD_isFrame +#define GB_ZSTD_isSkippableFrame GM_ZSTD_isSkippableFrame +#define GB_ZSTD_ldm_adjustParameters GM_ZSTD_ldm_adjustParameters +#define GB_ZSTD_ldm_blockCompress GM_ZSTD_ldm_blockCompress +#define GB_ZSTD_ldm_fillHashTable GM_ZSTD_ldm_fillHashTable +#define GB_ZSTD_ldm_generateSequences GM_ZSTD_ldm_generateSequences +#define GB_ZSTD_ldm_getMaxNbSeq GM_ZSTD_ldm_getMaxNbSeq +#define GB_ZSTD_ldm_getTableSize GM_ZSTD_ldm_getTableSize +#define GB_ZSTD_ldm_skipRawSeqStoreBytes GM_ZSTD_ldm_skipRawSeqStoreBytes +#define GB_ZSTD_ldm_skipSequences GM_ZSTD_ldm_skipSequences +#define GB_ZSTD_loadCEntropy GM_ZSTD_loadCEntropy +#define GB_ZSTD_loadDEntropy GM_ZSTD_loadDEntropy +#define GB_ZSTD_malloc GM_ZSTD_malloc +#define GB_ZSTD_maxCLevel GM_ZSTD_maxCLevel +#define GB_ZSTD_mergeBlockDelimiters GM_ZSTD_mergeBlockDelimiters +#define GB_ZSTD_minCLevel GM_ZSTD_minCLevel +#define GB_ZSTDMT_compressStream_generic GM_ZSTDMT_compressStream_generic +#define GB_ZSTDMT_createCCtx_advanced GM_ZSTDMT_createCCtx_advanced +#define GB_ZSTDMT_freeCCtx GM_ZSTDMT_freeCCtx +#define GB_ZSTDMT_getFrameProgression GM_ZSTDMT_getFrameProgression +#define GB_ZSTDMT_initCStream_internal GM_ZSTDMT_initCStream_internal +#define GB_ZSTDMT_nextInputSizeHint GM_ZSTDMT_nextInputSizeHint +#define GB_ZSTDMT_sizeof_CCtx GM_ZSTDMT_sizeof_CCtx +#define GB_ZSTDMT_toFlushNow GM_ZSTDMT_toFlushNow +#define GB_ZSTDMT_updateCParams_whileCompressing GM_ZSTDMT_updateCParams_whileCompressing +#define GB_ZSTD_nextInputType GM_ZSTD_nextInputType +#define GB_ZSTD_nextSrcSizeToDecompress GM_ZSTD_nextSrcSizeToDecompress +#define GB_ZSTD_noCompressLiterals GM_ZSTD_noCompressLiterals +#define GB_ZSTD_readSkippableFrame GM_ZSTD_readSkippableFrame +#define GB_ZSTD_referenceExternalSequences GM_ZSTD_referenceExternalSequences +#define GB_ZSTD_reset_compressedBlockState GM_ZSTD_reset_compressedBlockState +#define GB_ZSTD_resetCStream GM_ZSTD_resetCStream +#define GB_ZSTD_resetDStream GM_ZSTD_resetDStream +#define GB_ZSTD_resetSeqStore GM_ZSTD_resetSeqStore +#define GB_ZSTD_row_update GM_ZSTD_row_update +#define GB_ZSTD_selectBlockCompressor GM_ZSTD_selectBlockCompressor +#define GB_ZSTD_selectEncodingType GM_ZSTD_selectEncodingType +#define GB_ZSTD_seqToCodes GM_ZSTD_seqToCodes +#define GB_ZSTD_sizeof_CCtx GM_ZSTD_sizeof_CCtx +#define GB_ZSTD_sizeof_CDict GM_ZSTD_sizeof_CDict +#define GB_ZSTD_sizeof_CStream GM_ZSTD_sizeof_CStream +#define GB_ZSTD_sizeof_DCtx GM_ZSTD_sizeof_DCtx +#define GB_ZSTD_sizeof_DDict GM_ZSTD_sizeof_DDict +#define GB_ZSTD_sizeof_DStream GM_ZSTD_sizeof_DStream +#define GB_ZSTD_toFlushNow GM_ZSTD_toFlushNow +#define GB_ZSTD_updateTree GM_ZSTD_updateTree +#define GB_ZSTD_versionNumber GM_ZSTD_versionNumber +#define GB_ZSTD_versionString GM_ZSTD_versionString +#define GB_ZSTD_writeLastEmptyBlock GM_ZSTD_writeLastEmptyBlock +#define GB_ZSTD_writeSkippableFrame GM_ZSTD_writeSkippableFrame +#define GB_ZSTD_XXH32 GM_ZSTD_XXH32 +#define GB_ZSTD_XXH32_canonicalFromHash GM_ZSTD_XXH32_canonicalFromHash +#define GB_ZSTD_XXH32_copyState GM_ZSTD_XXH32_copyState +#define GB_ZSTD_XXH32_createState GM_ZSTD_XXH32_createState +#define GB_ZSTD_XXH32_digest GM_ZSTD_XXH32_digest +#define GB_ZSTD_XXH32_freeState GM_ZSTD_XXH32_freeState +#define GB_ZSTD_XXH32_hashFromCanonical 
GM_ZSTD_XXH32_hashFromCanonical +#define GB_ZSTD_XXH32_reset GM_ZSTD_XXH32_reset +#define GB_ZSTD_XXH32_update GM_ZSTD_XXH32_update +#define GB_ZSTD_XXH64 GM_ZSTD_XXH64 +#define GB_ZSTD_XXH64_canonicalFromHash GM_ZSTD_XXH64_canonicalFromHash +#define GB_ZSTD_XXH64_copyState GM_ZSTD_XXH64_copyState +#define GB_ZSTD_XXH64_createState GM_ZSTD_XXH64_createState +#define GB_ZSTD_XXH64_digest GM_ZSTD_XXH64_digest +#define GB_ZSTD_XXH64_freeState GM_ZSTD_XXH64_freeState +#define GB_ZSTD_XXH64_hashFromCanonical GM_ZSTD_XXH64_hashFromCanonical +#define GB_ZSTD_XXH64_reset GM_ZSTD_XXH64_reset +#define GB_ZSTD_XXH64_update GM_ZSTD_XXH64_update +#define GB_ZSTD_XXH_versionNumber GM_ZSTD_XXH_versionNumber diff --git a/GraphBLAS/GraphBLAS/rename/rename.awk b/GraphBLAS/GraphBLAS/rename/rename.awk index fd1501a0f..843e77d62 100644 --- a/GraphBLAS/GraphBLAS/rename/rename.awk +++ b/GraphBLAS/GraphBLAS/rename/rename.awk @@ -4,5 +4,7 @@ gsub (/GxB_/, "GxM_", gbrename) ; gsub (/GrB_/, "GrM_", gbrename) ; gsub (/GB_/, "GM_", gbrename) ; - printf "#define %s %s\n", gbname, gbrename + if (length (gbname) > 0) { + printf "#define %s %s\n", gbname, gbrename + } } diff --git a/GraphBLAS/GraphBLAS/test/Contents.m b/GraphBLAS/GraphBLAS/test/Contents.m index 59bf736c1..964b376fd 100644 --- a/GraphBLAS/GraphBLAS/test/Contents.m +++ b/GraphBLAS/GraphBLAS/test/Contents.m @@ -119,6 +119,12 @@ % gbtest113 - test ones and eq % gbtest114 - test kron with iso matrices % gbtest115 - test serialize/deserialize +% gbtest116 - list all idxunop operators for GrB.apply2 +% gbtest117 - test idxunop in GrB.apply2 +% gbtest118 - test GrB.argsort +% gbtest119 - test GrB.eunion +% gbtest120 - test subsref +% gbtest121 - test times with scalars % % Utilities and other tests: % @@ -131,6 +137,7 @@ % gbtest_complex - return list of complex operators % gbtest_err - compare two matrices % gb_contains - same as contains (text, pattern) +% gb_octave - return true if Octave is in use, false for MATLAB % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: GPL-3.0-or-later diff --git a/GraphBLAS/GraphBLAS/test/gbtest.m b/GraphBLAS/GraphBLAS/test/gbtest.m index 99d0627e0..7c48ddd9b 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest.m +++ b/GraphBLAS/GraphBLAS/test/gbtest.m @@ -171,6 +171,8 @@ gbtest117 % test idxunop in GrB.apply2 gbtest118 % test GrB.argsort gbtest119 % test GrB.eunion +gbtest120 % test subsref +gbtest121 % test times with scalars gbtest96 % test GrB.optype if (~have_octave) diff --git a/GraphBLAS/GraphBLAS/test/gbtest0.m b/GraphBLAS/GraphBLAS/test/gbtest0.m index d788b5f99..4f8ddc8c8 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest0.m +++ b/GraphBLAS/GraphBLAS/test/gbtest0.m @@ -5,6 +5,7 @@ % SPDX-License-Identifier: GPL-3.0-or-later GrB.clear +GrB.init assert (isequal (GrB.format, 'by col')) ; assert (isequal (GrB.chunk, 64*1024)) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest101.m b/GraphBLAS/GraphBLAS/test/gbtest101.m index 3b5afef8c..bbaeda633 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest101.m +++ b/GraphBLAS/GraphBLAS/test/gbtest101.m @@ -33,9 +33,10 @@ assert (isequal (t1, t2)) ; [s1, f1] = GrB.format (G) ; -[s2, f2] = GrB.format (G2) ; +[s2, f2, iso] = GrB.format (G2) ; assert (isequal (s1, s2)) ; assert (isequal (f1, f2)) ; +iso H2 = GrB (H, 'hyper') ; fprintf ('================== v3 hypersparse struct:\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest115.m b/GraphBLAS/GraphBLAS/test/gbtest115.m index a24a5ee03..f0511e47c 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest115.m +++ b/GraphBLAS/GraphBLAS/test/gbtest115.m @@ -7,7 +7,7 @@ rng ('default') ; types = gbtest_types ; -compression_methods = { 'none', 'lz4', 'lz4hc', 'debug' } ; +compression_methods = { 'none', 'lz4', 'lz4hc', 'zstd', 'debug' } ; for k = 1:length(types) type = types {k} ; @@ -26,13 +26,29 @@ B = GrB.deserialize (blob) ; assert (isequal (A, B)) ; - % levels 0:9 for lz4hc + B = GrB.deserialize (blob, 'fast') ; + assert (isequal (A, B)) ; + + B = GrB.deserialize (blob, 'secure') ; + assert (isequal (A, B)) ; + + B = GrB.deserialize (blob, 'secure', type) ; + assert (isequal (A, B)) ; + if (k2 == 3) + % levels 0:9 for lz4hc for level = 0:9 blob = GrB.serialize (A, method, level) ; B = GrB.deserialize (blob) ; assert (isequal (A, B)) ; end + elseif (k2 == 4) + % levels 0:19 for zstd + for level = 0:19 + blob = GrB.serialize (A, method, level) ; + B = GrB.deserialize (blob) ; + assert (isequal (A, B)) ; + end end end end diff --git a/GraphBLAS/GraphBLAS/test/gbtest117.m b/GraphBLAS/GraphBLAS/test/gbtest117.m index 5277cc1fe..14887d2b5 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest117.m +++ b/GraphBLAS/GraphBLAS/test/gbtest117.m @@ -81,4 +81,10 @@ end +% diagindex j - (i + thunk) +C1 = GrB.apply2 ('diagindex', 0, 3) ; +assert (C1 == -3) ; +C1 = GrB.apply2 ('diagindex', 1, 0) ; +assert (C1 == 0) ; + fprintf ('gbtest117: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest118.m b/GraphBLAS/GraphBLAS/test/gbtest118.m index 98d640676..46f1a20b2 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest118.m +++ b/GraphBLAS/GraphBLAS/test/gbtest118.m @@ -65,6 +65,9 @@ assert (isequal (Pk, a)) ; end + C2 = GrB.argsort (A) ; + assert (isequal (C, C2)) ; + [C, P] = GrB.argsort (A, 'descend') ; for k = 1:n Ak = A (:,k) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest119.m b/GraphBLAS/GraphBLAS/test/gbtest119.m index 0485dd041..5e573718c 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest119.m +++ b/GraphBLAS/GraphBLAS/test/gbtest119.m @@ -171,5 +171,9 @@ C1 = GrB.eunion (accum, c, m, a, alpha, op, b, beta, desc) ; assert (isequal (C1, C2)) ; C1 = GrB.eunion 
(accum, c, m, a, alpha, b, beta, op, desc) ; assert (isequal (C1, C2)) ;
+beta = GrB (beta) ;
+alpha = GrB (alpha) ;
+C1 = GrB.eunion (accum, c, m, a, alpha, op, b, beta, desc) ; assert (isequal (C1, C2)) ;
+
fprintf ('gbtest119: all tests passed\n') ;
diff --git a/GraphBLAS/GraphBLAS/test/gbtest12.m b/GraphBLAS/GraphBLAS/test/gbtest12.m
index 712560242..09fa25be8 100644
--- a/GraphBLAS/GraphBLAS/test/gbtest12.m
+++ b/GraphBLAS/GraphBLAS/test/gbtest12.m
@@ -105,5 +105,17 @@
err = norm (C-G, 1) ;
assert (err < 1e-12)
+C1 = 2 - C ;
+C2 = 2 - G ;
+assert (isequal (C1, C2)) ;
+
+C1 = 0 - C ;
+C2 = 0 - G ;
+assert (isequal (C1, C2)) ;
+
+C1 = C - 2 ;
+C2 = G - 2 ;
+assert (isequal (C1, C2)) ;
+
fprintf ('gbtest12: all tests passed\n') ;
diff --git a/GraphBLAS/GraphBLAS/test/gbtest120.m b/GraphBLAS/GraphBLAS/test/gbtest120.m
new file mode 100644
index 000000000..2e757ba45
--- /dev/null
+++ b/GraphBLAS/GraphBLAS/test/gbtest120.m
@@ -0,0 +1,36 @@
+function gbtest120
+%GBTEST120 test subsref
+
+% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
+% SPDX-License-Identifier: GPL-3.0-or-later
+
+x = sparse (1:5) ;
+C1 = x (:) ;
+y = GrB (x) ;
+C2 = y (:) ;
+assert (isequal (C1, C2)) ;
+
+x = sparse (magic (4)) ;
+C1 = x (:) ;
+y = GrB (x) ;
+C2 = y (:) ;
+assert (isequal (C1, C2)) ;
+
+% linear indexing would require a 128-bit integer, so it fails
+n = 2^50 ;
+H = GrB (n,n) ;
+H (1,1) = 42 ;
+H (n,n) = 99 ;
+H
+try
+    C = H (:)
+    ok = false ;
+catch expected_error
+    % 'problem too large'
+    ok = true ;
+end
+assert (ok)
+expected_error
+
+fprintf ('gbtest120: all tests passed\n') ;
+
diff --git a/GraphBLAS/GraphBLAS/test/gbtest121.m b/GraphBLAS/GraphBLAS/test/gbtest121.m
new file mode 100644
index 000000000..bc938e265
--- /dev/null
+++ b/GraphBLAS/GraphBLAS/test/gbtest121.m
@@ -0,0 +1,15 @@
+function gbtest121
+%GBTEST121 test times with scalars
+
+% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
+% SPDX-License-Identifier: GPL-3.0-or-later
+
+a = pi ;
+b = 2 ;
+c1 = a.*b ;
+c2 = GrB (a) .* GrB (b) ;
+
+assert (isequal (c1, c2)) ;
+
+fprintf ('gbtest121: all tests passed\n') ;
+
diff --git a/GraphBLAS/GraphBLAS/test/gbtest122.m b/GraphBLAS/GraphBLAS/test/gbtest122.m
new file mode 100644
index 000000000..df11ab132
--- /dev/null
+++ b/GraphBLAS/GraphBLAS/test/gbtest122.m
@@ -0,0 +1,64 @@
+function gbtest122
+%GBTEST122 test reshape (extended methods in GrB)
+
+% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
+% SPDX-License-Identifier: GPL-3.0-or-later + +rng ('default') + +for m = 1:12 + for n = 1:12 + for kind = [0 1] + if (kind == 0) + A = rand (m, n) ; + else + A = sprand (m, n, 0.3) ; + end + G = GrB (A) ; + mn = m*n ; + H = GrB (A, 'by row') ; + + f = factor (mn) ; + + for k = 1:length (f) + S = nchoosek (f, k) ; + for i = 1:size(S,1) + + % reshape by column + m2 = prod (S (i,:)) ; + n2 = mn / m2 ; + C1 = reshape (A, m2, n2) ; + C2 = reshape (G, m2, n2) ; + assert (gbtest_eq (C1, C2)) ; + + C3 = reshape (H, m2, n2) ; + assert (gbtest_eq (C1, C3)) ; + + C1 = reshape (A, [m2 n2]) ; + C2 = reshape (G, [m2 n2]) ; + assert (gbtest_eq (C1, C2)) ; + + C3 = reshape (H, [m2 n2]) ; + assert (gbtest_eq (C1, C3)) ; + + % reshape by row + C1 = reshape (A', n2, m2)' ; + C2 = reshape (G, m2, n2, 'by row') ; + assert (gbtest_eq (C1, C2)) ; + + C3 = reshape (H, m2, n2, 'by row') ; + assert (gbtest_eq (C1, C3)) ; + + C2 = reshape (G, [m2 n2], 'by row') ; + assert (gbtest_eq (C1, C2)) ; + + C3 = reshape (H, [m2 n2], 'by row') ; + assert (gbtest_eq (C1, C3)) ; + + end + end + end + end +end + +fprintf ('gbtest122: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest13.m b/GraphBLAS/GraphBLAS/test/gbtest13.m index 7afc3c48f..cabdcdcbf 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest13.m +++ b/GraphBLAS/GraphBLAS/test/gbtest13.m @@ -30,10 +30,13 @@ assert (isequal (J (nz), J1)) ; [I1, J1] = find (G) ; - nz = find (C (:) ~= 0) ; assert (isequal (I (nz), I1)) ; assert (isequal (J (nz), J1)) ; + [I1] = find (G) ; + [I0] = find (C) ; + assert (isequal (I0, I1)) ; + [I0, J0, X0] = GrB.extracttuples (G, desc0) ; assert (isequal (C (:), X0)) ; assert (isequal (I_0, I0)) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest15.m b/GraphBLAS/GraphBLAS/test/gbtest15.m index 7607d2906..a5ff4c6f8 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest15.m +++ b/GraphBLAS/GraphBLAS/test/gbtest15.m @@ -10,6 +10,7 @@ 'sinh', 'cosh', 'tanh', 'asinh', 'acosh', 'atanh', ... 'signum', 'ceil', 'floor', 'round', 'trunc', 'pow2', ... 'expm1', 'log10', 'log1p', 'log2', 'lgamma', 'tgamma', 'erf', ... + 'cbrt', ... 'erfc', 'conj', 'creal', 'cimag', 'carg', 'isinf', 'isnan', ... 
'isinfinite', 'frexpx', 'frexpe', 'i0', 'i1', 'j0', 'j1' } ; @@ -31,7 +32,7 @@ GrB.unopinfo ; fprintf ('number of unary ops: %d\n', nops) ; -assert (nops == 212) ; +assert (nops == 214) ; fprintf ('gbtest15: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest16.m b/GraphBLAS/GraphBLAS/test/gbtest16.m index 23ca30850..542fc4635 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest16.m +++ b/GraphBLAS/GraphBLAS/test/gbtest16.m @@ -58,6 +58,10 @@ C2 = A (I,J) ; assert (gbtest_eq (C2, Cout)) ; +desc.base = 'zero-based' ; +Cout = GrB.extract (A, { int64(I) - 1 }, { int64(J) - 1 }, desc) ; +assert (gbtest_eq (C2, Cout)) ; + G = GrB.random (1, 10, inf) ; A = double (G) ; C0 = A (1:3) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest20.m b/GraphBLAS/GraphBLAS/test/gbtest20.m index f446f84c6..69aab198b 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest20.m +++ b/GraphBLAS/GraphBLAS/test/gbtest20.m @@ -28,5 +28,11 @@ end end +n = 2^60 ; +G = GrB (n, n) ; +G (n,1) = 1 +[lo, hi] = bandwidth (G) +assert (lo == int64 (2^60) - 1) + fprintf ('\ngbtest20: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest35.m b/GraphBLAS/GraphBLAS/test/gbtest35.m index 64d5bce9e..d5f563950 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest35.m +++ b/GraphBLAS/GraphBLAS/test/gbtest35.m @@ -1,5 +1,5 @@ function gbtest35 -%GBTEST35 test reshape +%GBTEST35 test reshape (built-in variant) % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: GPL-3.0-or-later @@ -8,26 +8,32 @@ for m = 0:6 for n = 0:10 - A = rand (m, n) ; - G = GrB (A) ; - mn = m*n ; - if (mn == 0) - C1 = reshape (A, n, m) ; - C2 = reshape (G, n, m) ; - assert (gbtest_eq (C1, C2)) ; - else - f = factor (mn) ; - for k = 1:length (f) - S = nchoosek (f, k) ; - for i = 1:size(S,1) - m2 = prod (S (i,:)) ; - n2 = mn / m2 ; - C1 = reshape (A, m2, n2) ; - C2 = reshape (G, m2, n2) ; - assert (gbtest_eq (C1, C2)) ; - C1 = reshape (A, [m2 n2]) ; - C2 = reshape (G, [m2 n2]) ; - assert (gbtest_eq (C1, C2)) ; + for kind = [0 1] + if (kind == 0) + A = rand (m, n) ; + else + A = sprand (m, n, 0.3) ; + end + G = GrB (A) ; + mn = m*n ; + if (mn == 0) + C1 = reshape (A, n, m) ; + C2 = reshape (G, n, m) ; + assert (gbtest_eq (C1, C2)) ; + else + f = factor (mn) ; + for k = 1:length (f) + S = nchoosek (f, k) ; + for i = 1:size(S,1) + m2 = prod (S (i,:)) ; + n2 = mn / m2 ; + C1 = reshape (A, m2, n2) ; + C2 = reshape (G, m2, n2) ; + assert (gbtest_eq (C1, C2)) ; + C1 = reshape (A, [m2 n2]) ; + C2 = reshape (G, [m2 n2]) ; + assert (gbtest_eq (C1, C2)) ; + end end end end diff --git a/GraphBLAS/GraphBLAS/test/gbtest37.m b/GraphBLAS/GraphBLAS/test/gbtest37.m index f884b1597..03936baad 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest37.m +++ b/GraphBLAS/GraphBLAS/test/gbtest37.m @@ -53,75 +53,90 @@ GC = GrB.prune (GA * GA') ; end - assert (gbtest_eq (A, GA)) ; - assert (gbtest_eq (L, GL)) ; - assert (gbtest_eq (U, GU)) ; - assert (gbtest_eq (D, GD)) ; - assert (gbtest_eq (C, GC)) ; - - if (~islogical (A)) - % built-in istril, istriu, and isdiag - % are not defined when A is logical. 
- assert (istril (A) == istril (GA)) ; - assert (istril (L) == istril (GL)) ; - assert (istril (U) == istril (GU)) ; - assert (istril (D) == istril (GD)) ; - assert (istril (C) == istril (GC)) ; - - assert (istriu (A) == istriu (GA)) ; - assert (istriu (L) == istriu (GL)) ; - assert (istriu (U) == istriu (GU)) ; - assert (istriu (D) == istriu (GD)) ; - assert (istriu (C) == istriu (GC)) ; - - assert (isdiag (A) == isdiag (GA)) ; - assert (isdiag (L) == isdiag (GL)) ; - assert (isdiag (U) == isdiag (GU)) ; - assert (isdiag (D) == isdiag (GD)) ; - assert (isdiag (C) == isdiag (GC)) ; - end + for fmt = 0:1 + + if (fmt == 1) + GA = GrB (GA, 'by row') ; + GL = GrB (GL, 'by row') ; + GU = GrB (GU, 'by row') ; + GD = GrB (GD, 'by row') ; + GC = GrB (GC, 'by row') ; + end + + assert (gbtest_eq (A, GA)) ; + assert (gbtest_eq (L, GL)) ; + assert (gbtest_eq (U, GU)) ; + assert (gbtest_eq (D, GD)) ; + assert (gbtest_eq (C, GC)) ; + + if (~islogical (A)) + % built-in istril, istriu, and isdiag + % are not defined when A is logical. + assert (istril (A) == istril (GA)) ; + assert (istril (L) == istril (GL)) ; + assert (istril (U) == istril (GU)) ; + assert (istril (D) == istril (GD)) ; + assert (istril (C) == istril (GC)) ; + + assert (istriu (A) == istriu (GA)) ; + assert (istriu (L) == istriu (GL)) ; + assert (istriu (U) == istriu (GU)) ; + assert (istriu (D) == istriu (GD)) ; + assert (istriu (C) == istriu (GC)) ; + + assert (isdiag (A) == isdiag (GA)) ; + assert (isdiag (L) == isdiag (GL)) ; + assert (isdiag (U) == isdiag (GU)) ; + assert (isdiag (D) == isdiag (GD)) ; + assert (isdiag (C) == isdiag (GC)) ; + end - assert (ishermitian (A) == ishermitian (GA)) ; - assert (ishermitian (L) == ishermitian (GL)) ; - assert (ishermitian (U) == ishermitian (GU)) ; - assert (ishermitian (D) == ishermitian (GD)) ; - assert (ishermitian (C) == ishermitian (GC)) ; - - assert (ishermitian (A, 'skew') == ishermitian (GA, 'skew')) ; - - assert (issymmetric (A) == issymmetric (GA)) ; - assert (issymmetric (L) == issymmetric (GL)) ; - assert (issymmetric (U) == issymmetric (GU)) ; - assert (issymmetric (D) == issymmetric (GD)) ; - assert (issymmetric (C) == issymmetric (GC)) ; - - assert (issymmetric (A, 'skew') == issymmetric (GA, 'skew')) ; - - if (~islogical (A)) - assert (isequal (bandwidth (A), bandwidth (GA))) ; - assert (isequal (bandwidth (L), bandwidth (GL))) ; - assert (isequal (bandwidth (U), bandwidth (GU))) ; - assert (isequal (bandwidth (D), bandwidth (GD))) ; - assert (isequal (bandwidth (C), bandwidth (GC))) ; - - assert (bandwidth (A, 'lower') == bandwidth (GA, 'lower')) ; - assert (bandwidth (L, 'lower') == bandwidth (GL, 'lower')) ; - assert (bandwidth (U, 'lower') == bandwidth (GU, 'lower')) ; - assert (bandwidth (D, 'lower') == bandwidth (GD, 'lower')) ; - assert (bandwidth (C, 'lower') == bandwidth (GC, 'lower')) ; - - assert (bandwidth (A, 'upper') == bandwidth (GA, 'upper')) ; - assert (bandwidth (L, 'upper') == bandwidth (GL, 'upper')) ; - assert (bandwidth (U, 'upper') == bandwidth (GU, 'upper')) ; - assert (bandwidth (D, 'upper') == bandwidth (GD, 'upper')) ; - assert (bandwidth (C, 'upper') == bandwidth (GC, 'upper')) ; - - for lo = 0:nmax - for hi = 0:nmax - assert (isbanded (A, lo, hi) == isbanded (GA, lo, hi)) ; - assert (isbanded (L, lo, hi) == isbanded (GL, lo, hi)) ; - assert (isbanded (U, lo, hi) == isbanded (GU, lo, hi)) ; - assert (isbanded (D, lo, hi) == isbanded (GD, lo, hi)) ; + assert (ishermitian (A) == ishermitian (GA)) ; + assert (ishermitian (L) == ishermitian (GL)) ; + 
assert (ishermitian (U) == ishermitian (GU)) ; + assert (ishermitian (D) == ishermitian (GD)) ; + assert (ishermitian (C) == ishermitian (GC)) ; + + assert (ishermitian (A, 'skew') == ishermitian (GA, 'skew')) ; + + assert (issymmetric (A) == issymmetric (GA)) ; + assert (issymmetric (L) == issymmetric (GL)) ; + assert (issymmetric (U) == issymmetric (GU)) ; + assert (issymmetric (D) == issymmetric (GD)) ; + assert (issymmetric (C) == issymmetric (GC)) ; + + assert (issymmetric (A, 'skew') == issymmetric (GA, 'skew')) ; + + if (~islogical (A)) + assert (isequal (bandwidth (A), bandwidth (GA))) ; + assert (isequal (bandwidth (L), bandwidth (GL))) ; + assert (isequal (bandwidth (U), bandwidth (GU))) ; + assert (isequal (bandwidth (D), bandwidth (GD))) ; + assert (isequal (bandwidth (C), bandwidth (GC))) ; + + assert (bandwidth (A, 'lower') == bandwidth (GA, 'lower')) ; + assert (bandwidth (L, 'lower') == bandwidth (GL, 'lower')) ; + assert (bandwidth (U, 'lower') == bandwidth (GU, 'lower')) ; + assert (bandwidth (D, 'lower') == bandwidth (GD, 'lower')) ; + assert (bandwidth (C, 'lower') == bandwidth (GC, 'lower')) ; + + assert (bandwidth (A, 'upper') == bandwidth (GA, 'upper')) ; + assert (bandwidth (L, 'upper') == bandwidth (GL, 'upper')) ; + assert (bandwidth (U, 'upper') == bandwidth (GU, 'upper')) ; + assert (bandwidth (D, 'upper') == bandwidth (GD, 'upper')) ; + assert (bandwidth (C, 'upper') == bandwidth (GC, 'upper')) ; + + for lo = 0:nmax + for hi = 0:nmax + assert (isbanded (A, lo, hi) == ... + isbanded (GA, lo, hi)) ; + assert (isbanded (L, lo, hi) == ... + isbanded (GL, lo, hi)) ; + assert (isbanded (U, lo, hi) == ... + isbanded (GU, lo, hi)) ; + assert (isbanded (D, lo, hi) == ... + isbanded (GD, lo, hi)) ; + end end end end diff --git a/GraphBLAS/GraphBLAS/test/gbtest43.m b/GraphBLAS/GraphBLAS/test/gbtest43.m index d8fa9f032..21fb8291f 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest43.m +++ b/GraphBLAS/GraphBLAS/test/gbtest43.m @@ -11,10 +11,13 @@ try x = prod (G, 'crud') ; %#ok<*NASGU> - ok = false %#ok<*NOPRT> + ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -23,7 +26,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -32,7 +38,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -41,6 +50,10 @@ ok = false ; catch expected_error expected_error + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -49,6 +62,10 @@ ok = false ; catch expected_error expected_error + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -57,7 +74,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -66,7 +86,10 @@ ok = 
false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -75,7 +98,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -84,7 +110,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -93,7 +122,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -102,7 +134,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -111,7 +146,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -120,7 +158,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -129,7 +170,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -139,7 +183,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -148,7 +195,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -157,7 +207,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -166,7 +219,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -175,7 +231,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -184,7 +243,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + 
for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -193,7 +255,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -202,7 +267,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -211,7 +279,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -221,7 +292,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -230,7 +304,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -239,7 +316,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -248,7 +328,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -257,7 +340,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -266,7 +352,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -275,7 +364,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -285,7 +377,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -294,7 +389,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -303,7 +401,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -312,15 +413,10 @@ ok = false ; 
catch expected_error expected_error -end -assert (ok) ; - -try - C = reshape (v, [2 2], 2) ; - ok = false ; -catch expected_error - expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -329,7 +425,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -338,7 +437,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -347,7 +449,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -356,7 +461,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -365,7 +473,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -375,7 +486,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -384,7 +498,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -393,7 +510,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -402,7 +522,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -411,7 +534,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -420,7 +546,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -429,7 +558,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -441,7 +573,10 @@ assert (false) ; catch expected_error 
expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end n = 10 ; @@ -455,7 +590,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -464,7 +602,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -473,7 +614,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok ); @@ -482,7 +626,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok ); @@ -493,7 +640,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -504,7 +654,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -515,7 +668,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -526,7 +682,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -535,7 +694,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -544,7 +706,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -553,7 +718,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -562,7 +730,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -571,7 +742,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length 
(expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -580,7 +754,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -590,7 +767,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -600,7 +780,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -610,7 +793,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) @@ -619,7 +805,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) @@ -628,7 +817,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -637,7 +829,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -646,7 +841,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -656,7 +854,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -665,7 +866,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -674,7 +878,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -683,7 +890,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -692,7 +902,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -701,7 +914,10 @@ ok = false ; catch expected_error 
expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -710,7 +926,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -719,7 +938,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -728,7 +950,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -737,7 +962,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -746,7 +974,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -755,7 +986,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -764,7 +998,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -773,7 +1010,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -782,7 +1022,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -791,7 +1034,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -800,7 +1046,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -809,7 +1058,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -818,7 +1070,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length 
(expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -827,7 +1082,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -836,7 +1094,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -845,7 +1106,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -854,7 +1118,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -863,7 +1130,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -872,7 +1142,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -881,7 +1154,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -890,7 +1166,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -899,7 +1178,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -908,7 +1190,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -917,7 +1202,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -926,7 +1214,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -935,7 +1226,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -944,7 +1238,10 @@ ok = false ; catch 
expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -953,7 +1250,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -962,7 +1262,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -971,7 +1274,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -980,7 +1286,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -989,7 +1298,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -998,7 +1310,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -1007,7 +1322,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -1016,7 +1334,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; @@ -1025,7 +1346,10 @@ ok = false ; catch expected_error expected_error - disp (expected_error.stack (end-1)) + fprintf (' message: %s\n', expected_error.message) ; + for k = 1:length (expected_error.stack) + disp (expected_error.stack (k)) + end end assert (ok) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest44.m b/GraphBLAS/GraphBLAS/test/gbtest44.m index 70a9f7deb..80e9b7396 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest44.m +++ b/GraphBLAS/GraphBLAS/test/gbtest44.m @@ -49,5 +49,18 @@ C = sparse (true (3, 4)) ; assert (isequal (C, G)) +% test linear indexing for subsasgn +A = sprand (4, 3, 0.5) ; +G = GrB (A) ; +C1 = A ; C1 (:) = pi ; +C2 = G ; C2 (:) = pi ; +assert (isequal (C1, C2)) ; +X = sprand (12, 1, 0.5) ; +X +C1 = A ; C1 (:) = X +C2 = G ; C2 (:) = X +whos +assert (isequal (C1, C2)) ; + fprintf ('gbtest44: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest45.m b/GraphBLAS/GraphBLAS/test/gbtest45.m index c81f8913e..6934b2c5c 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest45.m +++ b/GraphBLAS/GraphBLAS/test/gbtest45.m @@ -16,7 +16,7 @@ t = GrB.vreduce ('+', G, d) ; z = sum (G, 2) ; w = sum (A, 2) ; - + assert (isequal (w, x)) ; assert (isequal (w, y)) ; assert (isequal (w, z)) ; 
diff --git a/GraphBLAS/GraphBLAS/test/gbtest46.m b/GraphBLAS/GraphBLAS/test/gbtest46.m index 0cf16c3f2..fec1f8837 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest46.m +++ b/GraphBLAS/GraphBLAS/test/gbtest46.m @@ -6,6 +6,8 @@ rng ('default') ; d.kind = 'sparse' ; +d0.kind = 'sparse' ; +d0.base = 'zero-based' ; types = gbtest_types ; for k = 1:length (types) @@ -39,12 +41,27 @@ C3 = GrB.assign (G, pi, { 1:3}, { 1 }) ; C4 = GrB.assign (G, pg, { 1:3}, { 1 }) ; C5 = GrB.assign (G, pg, { 1:3}, { 1 }, d) ; + C6 = GrB.assign (G, pg, { int64(1:3)-1 }, { int64(0) }, d0) ; + C7 = GrB.assign (G, pg, { int64(0), int64(2) }, { int64(0) }, d0) ; + C8 = GrB.assign (G, pg, { int64(0), int64(1), int64(2) }, { int64(0) }, ... + d0) ; assert (isequal (C1, C2)) ; assert (isequal (C1, C3)) ; assert (isequal (C1, C4)) ; assert (isequal (C1, C5)) ; + assert (isequal (C1, C6)) ; + assert (isequal (C1, C7)) ; + assert (isequal (C1, C8)) ; assert (isequal (class (C5), 'double')) ; + x = [ 1 2 3 4 5 ]' ; + C1 = A ; + C1 (5:-1:1,1) = x ; + G = GrB (A) ; + C8 = GrB.assign (G, x, { int64(4), int64(-1), int64(0) }, { int64(0) }, ... + d0) ; + assert (isequal (C1, C8)) ; + end fprintf ('gbtest46: all tests passed\n') ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest52.m b/GraphBLAS/GraphBLAS/test/gbtest52.m index d5f2c5be3..305d12e19 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest52.m +++ b/GraphBLAS/GraphBLAS/test/gbtest52.m @@ -7,11 +7,13 @@ GrB.format GrB.format ('by col') ; f = GrB.format %#ok<*NOPRT> +assert (isequal (f, 'by col')) ; A = magic (4) G = GrB (A) assert (isequal (f, GrB.format (G))) ; GrB.format ('by row') f = GrB.format %#ok<*NASGU> +assert (isequal (f, 'by row')) ; H = GrB (5,5) assert (isequal ('by row', GrB.format (H))) ; @@ -24,6 +26,7 @@ GrB.format ('by col') f = GrB.format +assert (isequal (f, 'by col')) ; H = GrB (5,5) assert (isequal ('by col', GrB.format (H))) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest56.m b/GraphBLAS/GraphBLAS/test/gbtest56.m index 18625b007..6e5ed1f00 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest56.m +++ b/GraphBLAS/GraphBLAS/test/gbtest56.m @@ -6,7 +6,7 @@ for m1 = -1:5 for n1 = -1:5 - + m = max (m1, 0) ; n = max (n1, 0) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest74.m b/GraphBLAS/GraphBLAS/test/gbtest74.m index d60c183a0..32e863e38 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest74.m +++ b/GraphBLAS/GraphBLAS/test/gbtest74.m @@ -133,6 +133,10 @@ C2 = bitshift (A2, B2, type) ; assert (isequal (C1, C2)) ; + C1 = bitshift (A, 2, type) ; + C2 = bitshift (A2, 2, type) ; + assert (isequal (C1, C2)) ; + % sparse case A = sprand (10, 10, 0.5) * imax ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest76.m b/GraphBLAS/GraphBLAS/test/gbtest76.m index 40a588715..2b0ec5e8a 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest76.m +++ b/GraphBLAS/GraphBLAS/test/gbtest76.m @@ -285,6 +285,11 @@ function gbtest76b (A, B, G, H, tol) err = norm (C1-C2, 1) ; assert (err < tol) ; + C1 = nthroot (real (A), 3) ; + C2 = cbrt (real (G)) ; + err = norm (C1-C2, 1) ; + assert (err < tol) ; + C1 = exp (A) ; C2 = exp (G) ; err = norm (C1-C2, 1) ; diff --git a/GraphBLAS/GraphBLAS/test/gbtest77.m b/GraphBLAS/GraphBLAS/test/gbtest77.m index 39a2d2e01..23e88bf4a 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest77.m +++ b/GraphBLAS/GraphBLAS/test/gbtest77.m @@ -50,6 +50,15 @@ end assert (ok) ; +try + C = cbrt (Z) + ok = false ; +catch expected_error + expected_error + disp (expected_error.stack (end-1)) +end +assert (ok) ; + try C = atan2 (Z,G) ok = false ; @@ -477,7 +486,7 @@ assert (ok) ; try - find (G) + norm (G, 3) ok = false ; 
catch expected_error expected_error @@ -489,7 +498,83 @@ assert (ok) ; try - norm (G, 3) + C = GrB.apply2 (G, '', '', pi, G) ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + C = GrB.select ('garbage', G) ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + C = GrB.eunion (G, G, G, G, G, G, G) ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + blob = GrB.serialize (G, 'garbage') ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + I = [1 2] ; + J = [3 4] ; + X = [pi 2] ; + gunk = magic (3) ; + C = GrB.build (I, J, X, gunk, gunk) ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + [C,P] = GrB.argsort (G, 2, 'garbage') ; + ok = false ; +catch expected_error + expected_error + s = expected_error.stack ; + for k = 1:length (s) + disp (s (k)) ; + end +end +assert (ok) ; + +try + [f,s,iso] = GrB.format ; ok = false ; catch expected_error expected_error diff --git a/GraphBLAS/GraphBLAS/test/gbtest8.m b/GraphBLAS/GraphBLAS/test/gbtest8.m index f75988a82..a30aa70cf 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest8.m +++ b/GraphBLAS/GraphBLAS/test/gbtest8.m @@ -172,6 +172,9 @@ C2 = GrB.select (A, '>0') ; assert (gbtest_eq (C1, C2)) + C2 = GrB.select ('positive.double', A) ; + assert (gbtest_eq (C1, C2)) + %------------------------------------------------------------------------- % nonnegative %------------------------------------------------------------------------- diff --git a/GraphBLAS/GraphBLAS/test/gbtest_err.m b/GraphBLAS/GraphBLAS/test/gbtest_err.m index fefe6f8bc..1101c49a2 100644 --- a/GraphBLAS/GraphBLAS/test/gbtest_err.m +++ b/GraphBLAS/GraphBLAS/test/gbtest_err.m @@ -2,7 +2,7 @@ %GBTEST_ERR compare two matrices % % err = gbtest_err (A, B) -% +% % Returns the norm (A-B,1), ignoring inf's and nan's. % Also tests the result of isinf and isnan for A and B. diff --git a/GraphBLAS/GraphBLAS/test/tcov/gbcovmake.m b/GraphBLAS/GraphBLAS/test/tcov/gbcovmake.m index 0f8a0958f..ac35d8af9 100644 --- a/GraphBLAS/GraphBLAS/test/tcov/gbcovmake.m +++ b/GraphBLAS/GraphBLAS/test/tcov/gbcovmake.m @@ -94,7 +94,7 @@ flags = [ flags ' LDFLAGS="$LDFLAGS -fopenmp -fPIC" '] ; end -inc = [ inc '-I. -I../util -I../../../../../../Include -I../../../../../../Source -I../../../../../../Source/Template' ] ; +inc = [ inc '-I. 
-I../util -I../../../../../../Include -I../../../../../../Source -I../../../../../../Source/Template -I../../../../../../cpu_features/include ' ] ; cd tmp/@GrB/private try diff --git a/GraphBLAS/Include/GraphBLAS.h b/GraphBLAS/Include/GraphBLAS.h index 6d1d4b349..52f1af9f9 100644 --- a/GraphBLAS/Include/GraphBLAS.h +++ b/GraphBLAS/Include/GraphBLAS.h @@ -221,10 +221,10 @@ // The version of this implementation, and the GraphBLAS API version: #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS" -#define GxB_IMPLEMENTATION_DATE "Apr 8, 2022" +#define GxB_IMPLEMENTATION_DATE "Aug 8, 2022" #define GxB_IMPLEMENTATION_MAJOR 7 -#define GxB_IMPLEMENTATION_MINOR 0 -#define GxB_IMPLEMENTATION_SUB 3 +#define GxB_IMPLEMENTATION_MINOR 2 +#define GxB_IMPLEMENTATION_SUB 0 #define GxB_SPEC_DATE "Nov 15, 2021" #define GxB_SPEC_MAJOR 2 #define GxB_SPEC_MINOR 0 @@ -352,21 +352,25 @@ GrB_Info ; typedef enum { - GrB_NONBLOCKING = 0, // methods may return with pending computations - GrB_BLOCKING = 1 // no computations are ever left pending + GrB_NONBLOCKING = 0, // methods may return with pending computations + GrB_BLOCKING = 1, // no computations are ever left pending +// DRAFT: in progress, do not use: + GxB_NONBLOCKING_GPU = 2, // non-blocking mode, allow use of GPU(s) + GxB_BLOCKING_GPU = 3, // blocking mode, allow use of GPU(s) } GrB_Mode ; GB_PUBLIC GrB_Info GrB_init // start up GraphBLAS ( - GrB_Mode mode // blocking or non-blocking mode + GrB_Mode mode // blocking or non-blocking mode, no GPU ) ; GB_PUBLIC GrB_Info GxB_init // start up GraphBLAS and also define malloc, etc ( - GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode, + // with or without GPU // pointers to memory management functions void * (* user_malloc_function ) (size_t), void * (* user_calloc_function ) (size_t, size_t), @@ -467,7 +471,7 @@ GrB_Info GrB_getVersion // runtime access to C API version number // done, and this setting has no effect. // // GxB_COMPRESSION: compression method for GxB_Matrix_serialize and -// GxB_Vector_serialize. The default is LZ4. +// GxB_Vector_serialize. The default is ZSTD (level 1). // // GxB_IMPORT: GxB_FAST_IMPORT (faster, for trusted input data) or // GxB_SECURE_IMPORT (slower, for untrusted input data), for the @@ -945,6 +949,10 @@ GB_PUBLIC GrB_UnaryOp GxB_LGAMMA_FP32, GxB_TGAMMA_FP32, GxB_ERF_FP32, GxB_ERFC_FP32, GxB_LGAMMA_FP64, GxB_TGAMMA_FP64, GxB_ERF_FP64, GxB_ERFC_FP64, + // z = cbrt (x) + GxB_CBRT_FP32, + GxB_CBRT_FP64, + // frexpx and frexpe return the mantissa and exponent, respectively, // from the ANSI C11 frexp function. The exponent is returned as a // floating-point value, not an integer. @@ -3196,6 +3204,17 @@ GrB_Info GrB_Vector_extractElement // x = v(i) (x, v, i) #endif +// GxB_Vector_isStoredElement determines if v(i) is present in the structure +// of the vector v, as a stored element. It does not return the value. It +// returns GrB_SUCCESS if the element is present, or GrB_NO_VALUE otherwise. 
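The new isStoredElement queries documented above are structure-only probes. A minimal usage sketch, assuming the v7.2.0 API exactly as declared in this diff (the vector form is declared just below, and the matrix form later in the header); error handling omitted:

    #include "GraphBLAS.h"
    #include <stdio.h>

    int main (void)
    {
        GrB_init (GrB_BLOCKING) ;
        GrB_Vector v ;
        GrB_Vector_new (&v, GrB_FP64, 10) ;
        GrB_Vector_setElement_FP64 (v, 3.5, 2) ;    // v(2) = 3.5
        // structure-only query: GrB_SUCCESS if v(i) is a stored element,
        // GrB_NO_VALUE if it is not; the value itself is not returned
        bool has2 = (GxB_Vector_isStoredElement (v, 2) == GrB_SUCCESS) ;
        bool has3 = (GxB_Vector_isStoredElement (v, 3) == GrB_SUCCESS) ;
        printf ("v(2): %d, v(3): %d\n", has2, has3) ;   // prints 1, 0
        GrB_Vector_free (&v) ;
        GrB_finalize ( ) ;
        return (0) ;
    }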
+ +GB_PUBLIC +GrB_Info GxB_Vector_isStoredElement // determine if v(i) is a stored element +( + const GrB_Vector v, // vector to check + GrB_Index i // row index +) ; + //------------------------------------------------------------------------------ // GrB_Vector_removeElement //------------------------------------------------------------------------------ @@ -3994,6 +4013,18 @@ GrB_Info GrB_Matrix_extractElement // x = A(i,j) (x, A, i, j) #endif +// GxB_Matrix_isStoredElement determines if A(i,j) is present in the structure +// of the matrix A, as a stored element. It does not return the value. It +// returns GrB_SUCCESS if the element is present, or GrB_NO_VALUE otherwise. + +GB_PUBLIC +GrB_Info GxB_Matrix_isStoredElement // determine if A(i,j) is a stored element +( + const GrB_Matrix A, // matrix to check + GrB_Index i, // row index + GrB_Index j // column index +) ; + //------------------------------------------------------------------------------ // GrB_Matrix_removeElement //------------------------------------------------------------------------------ @@ -11247,10 +11278,10 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format // GrB_Matrix_serialize/deserialize are slightly different from their GxB* // counterparts. The blob is allocated by GxB_Matrix_serialize, and must be -// freed by GxB_serialize_free (which calls the ANSI C11 free if GrB_init was -// used). By contrast, the GrB* methods require the user application to pass -// in a preallocated blob to GrB_Matrix_serialize, whose size can be given by -// GrB_Matrix_serializeSize (as a loose upper bound). +// freed by the same free() method passed to GxB_init (or the ANSI C11 free() +// if GrB_init was used). By contrast, the GrB* methods require the user +// application to pass in a preallocated blob to GrB_Matrix_serialize, whose +// size can be given by GrB_Matrix_serializeSize (as a loose upper bound). // The GrB* and GxB* methods can be mixed. GrB_Matrix_serialize and // GxB_Matrix_serialize construct the same blob (assuming they are given the @@ -11339,20 +11370,14 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format free (blob) ; */ -// Three methods are currently implemented: no compression, LZ4, and LZ4HC +// Currently implemented: no compression, LZ4, LZ4HC, and ZSTD #define GxB_COMPRESSION_NONE -1 // no compression -#define GxB_COMPRESSION_DEFAULT 0 // LZ4 +#define GxB_COMPRESSION_DEFAULT 0 // ZSTD (level 1) #define GxB_COMPRESSION_LZ4 1000 // LZ4 #define GxB_COMPRESSION_LZ4HC 2000 // LZ4HC, with default level 9 +#define GxB_COMPRESSION_ZSTD 3000 // ZSTD, with default level 1 -// possible future methods that could be added: -// #define GxB_COMPRESSION_ZLIB 3000 // ZLIB, with default level 6 -// #define GxB_COMPRESSION_LZO 4000 // LZO, with default level 2 -// #define GxB_COMPRESSION_BZIP2 5000 // BZIP2, with default level 9 -// #define GxB_COMPRESSION_LZSS 6000 // LZSS - -// using the Intel IPP versions, if available (not yet supported); -#define GxB_COMPRESSION_INTEL 1000000 +#define GxB_COMPRESSION_INTEL 1000000 // not yet supported // Most of the above methods have a level parameter that controls the tradeoff // between run time and the amount of compression obtained. 
Higher levels @@ -11360,31 +11385,16 @@ GrB_Info GrB_Matrix_exportHint // suggest the best export format // LZ4 no level setting // LZ4HC 1: fast, 9: default, 9: max - -// these methos are not yet supported but may be added in the future: -// ZLIB 1: fast, 6: default, 9: max -// LZO 1: fast (X1ST), 2: default (XST) -// BZIP2 1: fast, 9: default, 9: max -// LZSS no level setting +// ZSTD: 1: fast, 1: default, 19: max // For all methods, a level of zero results in the default level setting. // These settings can be added, so to use LZ4HC at level 5, use method = // GxB_COMPRESSION_LZ4HC + 5. -// If the Intel IPPS compression methods are available, they can be selected -// by adding GxB_COMPRESSION_INTEL. For example, to use the Intel IPPS -// implementation of LZ4HC at level 9, use method = GxB_COMPRESSION_INTEL + -// GxB_COMPRESSION_LZ4HC + 9 = 1,002,009. If the Intel methods are requested -// but not available, this setting is ignored and the non-Intel methods are -// used instead. - // If the level setting is out of range, the default is used for that method. // If the method is negative, no compression is performed. If the method is -// positive but unrecognized, the default is used (GxB_COMPRESSION_LZ4, with no -// level setting, and the non-Intel version). - -// If a method is not implemented, LZ4 is used instead, and the level setting -// is ignored. +// positive but unrecognized, the default is used (GxB_COMPRESSION_ZSTD, +// level 1). GB_PUBLIC GrB_Info GxB_Matrix_serialize // serialize a GrB_Matrix to a blob @@ -11536,6 +11546,65 @@ GrB_Info GxB_Matrix_sort ) \ (arg1, __VA_ARGS__) +//============================================================================== +// GxB_Matrix_reshape and GxB_Matrix_reshapeDup: reshape a matrix +//============================================================================== + +// GxB_Matrix_reshape changes the dimensions of a matrix, reshaping the entries +// by row or by column. + +// For example, if C is 3-by-4 on input, and is reshaped by column to have +// dimensions 2-by-6: + +// C on input C on output (by_col true) +// 00 01 02 03 00 20 11 02 22 13 +// 10 11 12 13 10 01 21 12 03 23 +// 20 21 22 23 + +// If the same C on input is reshaped by row to dimensions 2-by-6: + +// C on input C on output (by_col false) +// 00 01 02 03 00 01 02 03 10 11 +// 10 11 12 13 12 13 20 21 22 23 +// 20 21 22 23 + +// If the input matrix is nrows-by-ncols, and the size of the reshaped matrix +// is nrows_new-by-ncols_new, then nrows*ncols must equal nrows_new*ncols_new. +// The format of the input matrix (by row or by column) is unchanged; this +// format need not match the by_col input parameter. + +GB_PUBLIC +GrB_Info GxB_Matrix_reshape // reshape a GrB_Matrix in place +( + // input/output: + GrB_Matrix C, // input/output matrix, reshaped in place + // input: + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // new number of rows of C + GrB_Index ncols_new, // new number of columns of C + const GrB_Descriptor desc // to control # of threads used +) ; + +// GxB_Matrix_reshapeDup reshapes a matrix into another matrix. + +// If the input matrix A is nrows-by-ncols, and the size of the newly-created +// matrix C is nrows_new-by-ncols_new, then nrows*ncols must equal +// nrows_new*ncols_new. The format of the input matrix A (by row or by column) +// determines the format of the output matrix C, which need not match the +// by_col input parameter. 
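A short sketch of both new reshape methods, placed here before the reshapeDup declaration that follows; it assumes the v7.2.0 API exactly as declared in this diff, with error handling omitted. It builds the 3-by-4 matrix from the comment above, reshapes it in place to 2-by-6 by column, then makes a reshaped 3-by-4 copy:

    #include "GraphBLAS.h"

    int main (void)
    {
        GrB_init (GrB_BLOCKING) ;
        GrB_Matrix C, D ;
        GrB_Matrix_new (&C, GrB_FP64, 3, 4) ;
        for (GrB_Index i = 0 ; i < 3 ; i++)
        {
            for (GrB_Index j = 0 ; j < 4 ; j++)
            {
                // C(i,j) = 10*i + j, matching the 00..23 entries above
                GrB_Matrix_setElement_FP64 (C, (double) (10*i + j), i, j) ;
            }
        }
        // in place: C becomes 2-by-6 (3*4 == 2*6), entries taken by column
        GxB_Matrix_reshape (C, true, 2, 6, NULL) ;
        // not in place: D is a newly created 3-by-4 reshape of C
        GxB_Matrix_reshapeDup (&D, C, true, 3, 4, NULL) ;
        GrB_Matrix_free (&C) ;
        GrB_Matrix_free (&D) ;
        GrB_finalize ( ) ;
        return (0) ;
    }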
+ +GB_PUBLIC +GrB_Info GxB_Matrix_reshapeDup // reshape a GrB_Matrix into another GrB_Matrix +( + // output: + GrB_Matrix *C, // newly created output matrix, not in place + // input: + GrB_Matrix A, // input matrix, not modified + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // number of rows of C + GrB_Index ncols_new, // number of columns of C + const GrB_Descriptor desc // to control # of threads used +) ; //============================================================================== // GxB_Iterator: an object that iterates over the entries of a matrix or vector @@ -12542,15 +12611,28 @@ extern "C" { #endif // TODO describe the modes -typedef enum { rmm_wrap_host=0, rmm_wrap_host_pinned=1, rmm_wrap_device=2, rmm_wrap_managed=3 } RMM_MODE ; +typedef enum +{ + rmm_wrap_host = 0, + rmm_wrap_host_pinned = 1, + rmm_wrap_device = 2, + rmm_wrap_managed = 3 +} RMM_MODE ; void rmm_wrap_finalize (void) ; -int rmm_wrap_initialize (RMM_MODE mode, size_t init_pool_size, size_t max_pool_size) ; + +int rmm_wrap_initialize +( + RMM_MODE mode, + size_t init_pool_size, + size_t max_pool_size +) ; // example usage: // rmm_wrap_initialize (rmm_wrap_managed, INT32_MAX, INT64_MAX) ; - // GxB_init (GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free) ; - // use GraphBLAS ... + // GxB_init (GxB_NONBLOCKING_GPU, rmm_wrap_malloc, rmm_wrap_calloc, + // rmm_wrap_realloc, rmm_wrap_free) ; + // use GraphBLAS ... with the GPU // GrB_finalize ( ) ; // rmm_wrap_finalize ( ) ; diff --git a/GraphBLAS/LICENSE b/GraphBLAS/LICENSE index 9e449a470..7bf0ddede 100644 --- a/GraphBLAS/LICENSE +++ b/GraphBLAS/LICENSE @@ -110,3 +110,39 @@ cpu_features/ndk_compat: SPDX-License-Identifier: BSD-2-Clause OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +ZSTD: SPDX-License-Identifier: BSD-3-Clause +-------------------------------------------------------------------------------- + +BSD License + +For Zstandard software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/GraphBLAS/README.md b/GraphBLAS/README.md index 18228c695..662eaffcc 100644 --- a/GraphBLAS/README.md +++ b/GraphBLAS/README.md @@ -8,7 +8,7 @@ For the GraphBLAS/GraphBLAS Octave/MATLAB interface *only*: SPDX-License-Identifier: GPL-3.0-or-later (see below for a discussion of the licensing of this package). -VERSION 7.0.3, Apr 8, 2022 +VERSION 7.2.0, Aug 8, 2022 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard, which defines a set of sparse matrix operations on an extended algebra of @@ -24,8 +24,8 @@ built-in sparse matrix multiply in MATLAB R2021a, where `C=A*B` is now up to 30x faster than in prior versions of MATLAB (on my 20-core NVIDIA DGX Station). The development of this package is supported by Intel, NVIDIA (including the -donation of the 20-core DGX Station), Redis, MIT Lincoln Lab, IBM, and Julia -Computing. +donation of the 20-core DGX Station), Redis, MIT Lincoln Lab, MathWorks, +IBM, and Julia Computing. See the user guide in `Doc/GraphBLAS_UserGuide.pdf` for documentation on the SuiteSparse implementation of GraphBLAS, and how to use it in your diff --git a/GraphBLAS/Source/GB.h b/GraphBLAS/Source/GB.h index d20a3dca9..ed59da938 100644 --- a/GraphBLAS/Source/GB.h +++ b/GraphBLAS/Source/GB.h @@ -28,11 +28,21 @@ #include "GraphBLAS.h" //------------------------------------------------------------------------------ -// internal #include files +// handle the restrict keyword //------------------------------------------------------------------------------ +// Intentionally shadow the built-in "restrict" keyword. See GraphBLAS.h for +// the definition of GB_restrict. It becomes empty for C++, and "__restrict" +// for MS Visual Studio. Otherwise, GB_restrict is just "restrict" on ANSI C11 +// compliant compilers. I prefer to use the "restrict" keyword to make the code +// readable.
This #define is a patch for compilers that don't support it: + #define restrict GB_restrict +//------------------------------------------------------------------------------ +// internal #include files +//------------------------------------------------------------------------------ + #include "GB_prefix.h" #include "GB_bytes.h" #include "GB_defaults.h" diff --git a/GraphBLAS/Source/GB_AxB_colscale.c b/GraphBLAS/Source/GB_AxB_colscale.c index bb7627b8d..15e3cbd18 100644 --- a/GraphBLAS/Source/GB_AxB_colscale.c +++ b/GraphBLAS/Source/GB_AxB_colscale.c @@ -14,6 +14,7 @@ #ifndef GBCUDA_DEV #include "GB_binop__include.h" #endif +#include "GB_unused.h" #define GB_FREE_WORKSPACE \ { \ diff --git a/GraphBLAS/Source/GB_AxB_dot.c b/GraphBLAS/Source/GB_AxB_dot.c index dfb2b3bb9..836ef3646 100644 --- a/GraphBLAS/Source/GB_AxB_dot.c +++ b/GraphBLAS/Source/GB_AxB_dot.c @@ -170,33 +170,29 @@ GrB_Info GB_AxB_dot // dot product (multiple methods) GBURBLE ("(%sdot3) ", iso_kind) ; (*mask_applied) = true ; // mask is always applied (*done_in_place) = false ; + GrB_Info info ; #if defined ( GBCUDA ) if (!C_iso && // FIXME for CUDA, remove and create C iso on output GB_AxB_dot3_cuda_branch (M, Mask_struct, A, B, semiring, flipxy, Context)) { - // FIXME for CUDA: can M be jumbled for the CUDA kernel? - GB_MATRIX_WAIT (M) ; // make sure it's not jumbled - if (GB_AxB_dot3_control (M, Mask_comp) - && !GB_IS_HYPERSPARSE (M) // FIXME for CUDA, remove this - ) - { - return (GB_AxB_dot3_cuda (C, M, Mask_struct, A, B, semiring, - flipxy, Context)) ; - } + info = (GB_AxB_dot3_cuda (C, M, Mask_struct, A, B, semiring, + flipxy, Context)) ; } else #endif { // use the CPU - return (GB_AxB_dot3 (C, C_iso, cscalar, M, Mask_struct, A, B, + info = (GB_AxB_dot3 (C, C_iso, cscalar, M, Mask_struct, A, B, semiring, flipxy, Context)) ; } + // GxB_print (C,3) ; + return (info) ; } //-------------------------------------------------------------------------- - // general case: C<M>=A'*B, C<!M>=A'B*, or C=A'*B, not in-place + // general case: C<M>=A'*B, C<!M>=A'*B, or C=A'*B, not in-place //-------------------------------------------------------------------------- GBURBLE ("(%sdot2) ", iso_kind) ; diff --git a/GraphBLAS/Source/GB_AxB_dot2.c b/GraphBLAS/Source/GB_AxB_dot2.c index 0403d0c36..d297aa4c2 100644 --- a/GraphBLAS/Source/GB_AxB_dot2.c +++ b/GraphBLAS/Source/GB_AxB_dot2.c @@ -39,12 +39,12 @@ #endif GB_PUBLIC -GrB_Info GB_AxB_dot2 // C=A'*B or C<M>=A'*B, dot product method +GrB_Info GB_AxB_dot2 // C=A'*B or C<#M>=A'*B, dot product method ( GrB_Matrix C, // output matrix, static header const bool C_iso, // true if C is iso const GB_void *cscalar, // iso value of C - const GrB_Matrix M_in, // mask matrix for C<M>=A'*B, may be NULL + const GrB_Matrix M_in, // mask matrix for C<#M>=A'*B, may be NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const bool A_not_transposed, // if true, C=A*B, else C=A'*B diff --git a/GraphBLAS/Source/GB_AxB_dot3.c b/GraphBLAS/Source/GB_AxB_dot3.c index b16e263eb..53688cafc 100644 --- a/GraphBLAS/Source/GB_AxB_dot3.c +++ b/GraphBLAS/Source/GB_AxB_dot3.c @@ -18,6 +18,7 @@ #ifndef GBCUDA_DEV #include "GB_AxB__include2.h" #endif +#include "GB_unused.h" #define GB_FREE_WORKSPACE \ { \ @@ -219,6 +220,7 @@ GrB_Info GB_AxB_dot3 // C<M> = A'*B using dot product method #define GB_MASK_SPARSE_AND_STRUCTURAL #include "GB_meta16_factory.c" #undef GB_MASK_SPARSE_AND_STRUCTURAL + // TODO: skip phase1 if A and B are both bitmap/full.
} else { diff --git a/GraphBLAS/Source/GB_AxB_saxpy3.c b/GraphBLAS/Source/GB_AxB_saxpy3.c index ce6cd8fee..229d6bfca 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy3.c +++ b/GraphBLAS/Source/GB_AxB_saxpy3.c @@ -94,6 +94,7 @@ #ifndef GBCUDA_DEV #include "GB_AxB__include2.h" #endif +#include "GB_unused.h" #define GB_FREE_WORKSPACE \ { \ diff --git a/GraphBLAS/Source/GB_AxB_saxpy3_cumsum.c b/GraphBLAS/Source/GB_AxB_saxpy3_cumsum.c index e3b2c16fd..9d033f7b2 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy3_cumsum.c +++ b/GraphBLAS/Source/GB_AxB_saxpy3_cumsum.c @@ -11,6 +11,7 @@ // phase4: cumulative sum of C->p #include "GB_AxB_saxpy3.h" +#include "GB_unused.h" void GB_AxB_saxpy3_cumsum ( diff --git a/GraphBLAS/Source/GB_AxB_saxpy3_flopcount.c b/GraphBLAS/Source/GB_AxB_saxpy3_flopcount.c index c46449ead..edd11fae7 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy3_flopcount.c +++ b/GraphBLAS/Source/GB_AxB_saxpy3_flopcount.c @@ -69,6 +69,7 @@ #include "GB_ek_slice.h" #include "GB_bracket.h" #include "GB_AxB_saxpy3.h" +#include "GB_unused.h" #define GB_FREE_ALL \ { \ diff --git a/GraphBLAS/Source/GB_AxB_saxpy3_slice_balanced.c b/GraphBLAS/Source/GB_AxB_saxpy3_slice_balanced.c index dd22a0588..707a45a92 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy3_slice_balanced.c +++ b/GraphBLAS/Source/GB_AxB_saxpy3_slice_balanced.c @@ -11,6 +11,7 @@ // GrB_NO_VALUE, to indicate that the analysis was terminated early. #include "GB_AxB_saxpy3.h" +#include "GB_unused.h" // control parameters for generating parallel tasks #define GB_NTASKS_PER_THREAD 2 diff --git a/GraphBLAS/Source/GB_AxB_saxpy4.c b/GraphBLAS/Source/GB_AxB_saxpy4.c index 6238b6206..27ba4796b 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy4.c +++ b/GraphBLAS/Source/GB_AxB_saxpy4.c @@ -92,8 +92,8 @@ GrB_Info GB_AxB_saxpy4 // C += A*B //-------------------------------------------------------------------------- GrB_BinaryOp mult = semiring->multiply ; - GrB_Monoid add = semiring->add ; - ASSERT (mult->ztype == add->op->ztype) ; +// GrB_Monoid add = semiring->add ; + ASSERT (mult->ztype == semiring->add->op->ztype) ; bool A_is_pattern, B_is_pattern ; GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ; diff --git a/GraphBLAS/Source/GB_AxB_saxpy5.c b/GraphBLAS/Source/GB_AxB_saxpy5.c index 94320cde7..36743b43c 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy5.c +++ b/GraphBLAS/Source/GB_AxB_saxpy5.c @@ -100,8 +100,8 @@ GrB_Info GB_AxB_saxpy5 // C += A*B //-------------------------------------------------------------------------- GrB_BinaryOp mult = semiring->multiply ; - GrB_Monoid add = semiring->add ; - ASSERT (mult->ztype == add->op->ztype) ; +// GrB_Monoid add = semiring->add ; + ASSERT (mult->ztype == semiring->add->op->ztype) ; bool A_is_pattern, B_is_pattern ; GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ; diff --git a/GraphBLAS/Source/GB_AxB_saxpy_sparsity.c b/GraphBLAS/Source/GB_AxB_saxpy_sparsity.c index 47e6a0ffb..d0e21aec4 100644 --- a/GraphBLAS/Source/GB_AxB_saxpy_sparsity.c +++ b/GraphBLAS/Source/GB_AxB_saxpy_sparsity.c @@ -46,7 +46,7 @@ void GB_AxB_saxpy_sparsity // determine C_sparsity and method to use double m = (double) A->vlen ; double n = (double) B->vdim ; double anz = (double) GB_nnz_held (A) ; - double bnz = (double) GB_nnz_held (B) ; +// double bnz = (double) GB_nnz_held (B) ; int M_sparsity = (M == NULL) ? 
0 : GB_sparsity (M) ; int B_sparsity = GB_sparsity (B) ; diff --git a/GraphBLAS/Source/GB_Global.c b/GraphBLAS/Source/GB_Global.c index 563633c49..100068de6 100644 --- a/GraphBLAS/Source/GB_Global.c +++ b/GraphBLAS/Source/GB_Global.c @@ -26,7 +26,8 @@ typedef struct // blocking/non-blocking mode, set by GrB_init //-------------------------------------------------------------------------- - GrB_Mode mode ; // GrB_NONBLOCKING or GrB_BLOCKING + GrB_Mode mode ; // GrB_NONBLOCKING, GrB_BLOCKING + // GxB_NONBLOCKING_GPU, or GxB_BLOCKING_GPU bool GrB_init_called ; // true if GrB_init already called //-------------------------------------------------------------------------- @@ -152,7 +153,7 @@ typedef struct GrB_Desc_Value gpu_control ; // always, never, or default double gpu_chunk ; // min problem size for using a GPU // properties of each GPU: - rmm_device gpu_properties [GB_CUDA_MAX_GPUS] ; + GB_cuda_device gpu_properties [GB_CUDA_MAX_GPUS] ; } GB_Global_struct ; @@ -163,7 +164,7 @@ GB_Global_struct GB_Global = { // GraphBLAS mode - .mode = GrB_NONBLOCKING, // default is nonblocking + .mode = GrB_NONBLOCKING, // default is nonblocking, no GPU // initialization flag .GrB_init_called = false, // GrB_init has not yet been called @@ -651,7 +652,7 @@ void GB_Global_memtable_add (void *p, size_t size) #endif #pragma omp critical(GB_memtable) { - int n = GB_Global.nmemtable ; + int n = GB_Global.nmemtable ; fail = (n > GB_MEMTABLE_SIZE) ; if (!fail) { @@ -691,7 +692,7 @@ size_t GB_Global_memtable_size (void *p) bool found = false ; #pragma omp critical(GB_memtable) { - int n = GB_Global.nmemtable ; + int n = GB_Global.nmemtable ; for (int i = 0 ; i < n ; i++) { if (p == GB_Global.memtable_p [i]) @@ -721,7 +722,7 @@ bool GB_Global_memtable_find (void *p) if (p == NULL) return (false) ; #pragma omp critical(GB_memtable) { - int n = GB_Global.nmemtable ; + int n = GB_Global.nmemtable ; for (int i = 0 ; i < n ; i++) { if (p == GB_Global.memtable_p [i]) @@ -752,7 +753,7 @@ void GB_Global_memtable_remove (void *p) #endif #pragma omp critical(GB_memtable) { - int n = GB_Global.nmemtable ; + int n = GB_Global.nmemtable ; for (int i = 0 ; i < n ; i++) { if (p == GB_Global.memtable_p [i]) @@ -992,13 +993,13 @@ bool GB_Global_burble_get (void) } GB_PUBLIC -GB_printf_function_t GB_Global_printf_get ( ) +GB_printf_function_t GB_Global_printf_get (void) { return (GB_Global.printf_func) ; } GB_PUBLIC -GB_flush_function_t GB_Global_flush_get ( ) +GB_flush_function_t GB_Global_flush_get (void) { return (GB_Global.flush_func) ; } @@ -1131,35 +1132,35 @@ int GB_Global_gpu_sm_get (int device) { // get the # of SMs in a specific GPU GB_GPU_DEVICE_CHECK (0) ; // zero if invalid GPU - return (GB_Global.gpu_properties [device].number_of_sms) ; + return (GB_Global.gpu_properties [device].number_of_sms) ; } -bool GB_Global_gpu_device_pool_size_set( int device, size_t size) +bool GB_Global_gpu_device_pool_size_set (int device, size_t size) { - GB_GPU_DEVICE_CHECK (0) ; // zero if invalid GPU - GB_Global.gpu_properties [device].pool_size = (int) size ; - return( true); + GB_GPU_DEVICE_CHECK (false) ; // fail if invalid GPU + GB_Global.gpu_properties [device].pool_size = size ; + return (true) ; } -bool GB_Global_gpu_device_max_pool_size_set( int device, size_t size) +bool GB_Global_gpu_device_max_pool_size_set (int device, size_t size) { - GB_GPU_DEVICE_CHECK (0) ; // zero if invalid GPU - GB_Global.gpu_properties[device].max_pool_size = (int) size ; - return( true); + GB_GPU_DEVICE_CHECK (false) ; // fail if invalid GPU + 
GB_Global.gpu_properties[device].max_pool_size = size ; + return (true) ; } -bool GB_Global_gpu_device_memory_resource_set( int device, void *resource) +bool GB_Global_gpu_device_memory_resource_set (int device, void *resource) { - GB_GPU_DEVICE_CHECK (0) ; // zero if invalid GPU + GB_GPU_DEVICE_CHECK (false) ; // fail if invalid GPU GB_Global.gpu_properties[device].memory_resource = resource; - return( true); + return (true) ; } -void* GB_Global_gpu_device_memory_resource_get( int device ) +void* GB_Global_gpu_device_memory_resource_get (int device) { - GB_GPU_DEVICE_CHECK (0) ; // zero if invalid GPU - return ( GB_Global.gpu_properties [device].memory_resource ) ; - //NOTE: this returns a void*, needs to be cast to be used + GB_GPU_DEVICE_CHECK (false) ; // fail if invalid GPU + return (GB_Global.gpu_properties [device].memory_resource) ; + // NOTE: this returns a void*, needs to be cast to be used } bool GB_Global_gpu_device_properties_get (int device) diff --git a/GraphBLAS/Source/GB_Matrix_diag.c b/GraphBLAS/Source/GB_Matrix_diag.c index ae4e826bb..e47b47a0a 100644 --- a/GraphBLAS/Source/GB_Matrix_diag.c +++ b/GraphBLAS/Source/GB_Matrix_diag.c @@ -19,6 +19,7 @@ } #include "GB_diag.h" +#include "GB_unused.h" GrB_Info GB_Matrix_diag // build a diagonal matrix from a vector ( @@ -135,9 +136,6 @@ GrB_Info GB_Matrix_diag // build a diagonal matrix from a vector int64_t *restrict Cp = C->p ; int64_t *restrict Ch = C->h ; int64_t *restrict Ci = C->i ; - GB_Type_code vcode = vtype->code ; - GB_Type_code ccode = ctype->code ; - size_t vsize = vtype->size ; //-------------------------------------------------------------------------- // copy the contents of V into the kth diagonal of C diff --git a/GraphBLAS/Source/GB_accum_mask.c b/GraphBLAS/Source/GB_accum_mask.c index 30f7be970..ec107dcd2 100644 --- a/GraphBLAS/Source/GB_accum_mask.c +++ b/GraphBLAS/Source/GB_accum_mask.c @@ -57,6 +57,7 @@ #include "GB_transpose.h" #include "GB_accum_mask.h" #include "GB_bitmap_assign.h" +#include "GB_unused.h" /* ----------------------------------------------------------------------------- diff --git a/GraphBLAS/Source/GB_bitmap_AxB_saxpy.c b/GraphBLAS/Source/GB_bitmap_AxB_saxpy.c index e42869512..a5ca5d0f5 100644 --- a/GraphBLAS/Source/GB_bitmap_AxB_saxpy.c +++ b/GraphBLAS/Source/GB_bitmap_AxB_saxpy.c @@ -90,8 +90,8 @@ GrB_Info GB_bitmap_AxB_saxpy // C = A*B where C is bitmap //-------------------------------------------------------------------------- GrB_BinaryOp mult = semiring->multiply ; - GrB_Monoid add = semiring->add ; - ASSERT (mult->ztype == add->op->ztype) ; +// GrB_Monoid add = semiring->add ; + ASSERT (mult->ztype == semiring->add->op->ztype) ; bool A_is_pattern, B_is_pattern ; GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ; diff --git a/GraphBLAS/Source/GB_block.c b/GraphBLAS/Source/GB_block.c index 152f3fd8b..b02d9c9f3 100644 --- a/GraphBLAS/Source/GB_block.c +++ b/GraphBLAS/Source/GB_block.c @@ -38,7 +38,8 @@ GrB_Info GB_block // apply all pending computations if blocking mode enabled double npending = (double) GB_Pending_n (A) ; double anzmax = ((double) A->vlen) * ((double) A->vdim) ; bool many_pending = (npending >= anzmax) ; - bool blocking = (GB_Global_mode_get ( ) == GrB_BLOCKING) ; + GrB_Mode mode = GB_Global_mode_get ( ) ; + bool blocking = (mode == GrB_BLOCKING || mode == GxB_BLOCKING_GPU) ; if (many_pending || blocking) { diff --git a/GraphBLAS/Source/GB_builder.c b/GraphBLAS/Source/GB_builder.c index 333684df2..a3f31d6ac 100644 --- 
a/GraphBLAS/Source/GB_builder.c +++ b/GraphBLAS/Source/GB_builder.c @@ -205,7 +205,7 @@ GrB_Info GB_builder // build a matrix from tuples int64_t *restrict I_work = (*I_work_handle) ; int64_t *restrict J_work = (*J_work_handle) ; int64_t *restrict K_work = NULL ; size_t K_work_size = 0 ; - ASSERT (*J_work_size_handle == GB_Global_memtable_size (J_work)) ; +// ASSERT (*J_work_size_handle == GB_Global_memtable_size (J_work)) ; //-------------------------------------------------------------------------- // determine the number of threads to use diff --git a/GraphBLAS/Source/GB_calloc_memory.c b/GraphBLAS/Source/GB_calloc_memory.c index 4941e5dcb..69718609c 100644 --- a/GraphBLAS/Source/GB_calloc_memory.c +++ b/GraphBLAS/Source/GB_calloc_memory.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_calloc_memory: wrapper for calloc +// GB_calloc_memory: wrapper for calloc (actually uses malloc and memset) //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -12,7 +12,7 @@ #include "GB.h" //------------------------------------------------------------------------------ -// GB_calloc_helper: use calloc or malloc/memset to allocate initialized block +// GB_calloc_helper: malloc/memset to allocate an initialized block //------------------------------------------------------------------------------ static inline void *GB_calloc_helper diff --git a/GraphBLAS/Source/GB_compiler.h b/GraphBLAS/Source/GB_compiler.h index cdca69b16..aa19db64c 100644 --- a/GraphBLAS/Source/GB_compiler.h +++ b/GraphBLAS/Source/GB_compiler.h @@ -213,7 +213,7 @@ #define GB_HAS_VLA 1 -#elif GxB_STDC_VERSION >= 199901L +#elif (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) // ANSI C99 and later #define GB_HAS_VLA 1 diff --git a/GraphBLAS/Source/GB_concat_bitmap.c b/GraphBLAS/Source/GB_concat_bitmap.c index b36c65c01..cd09c3c9b 100644 --- a/GraphBLAS/Source/GB_concat_bitmap.c +++ b/GraphBLAS/Source/GB_concat_bitmap.c @@ -16,6 +16,7 @@ GB_phbix_free (C) ; #include "GB_concat.h" +#include "GB_unused.h" GrB_Info GB_concat_bitmap // concatenate into a bitmap matrix ( diff --git a/GraphBLAS/Source/GB_concat_full.c b/GraphBLAS/Source/GB_concat_full.c index e738a7d0e..ebffa0baf 100644 --- a/GraphBLAS/Source/GB_concat_full.c +++ b/GraphBLAS/Source/GB_concat_full.c @@ -170,11 +170,13 @@ GrB_Info GB_concat_full // concatenate into a full matrix case GB_16BYTE : // double complex or 16-byte user #define GB_CTYPE GB_blob16 -// #define GB_CTYPE uint64_t -// #undef GB_COPY -// #define GB_COPY(pC,pA,A_iso) \ -// Cx [2*pC ] = Ax [A_iso ? 0 : (2*pA)] ; \ -// Cx [2*pC+1] = Ax [A_iso ? 1 : (2*pA+1)] ; + /* + #define GB_CTYPE uint64_t + #undef GB_COPY + #define GB_COPY(pC,pA,A_iso) \ + Cx [2*pC ] = Ax [A_iso ? 0 : (2*pA)] ; \ + Cx [2*pC+1] = Ax [A_iso ? 
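The GB_compiler.h hunk above replaces a test of GxB_STDC_VERSION with a direct, self-contained check of __STDC_VERSION__, presumably so the header no longer depends on another header having defined that macro; GB_HAS_VLA is then set correctly under C99 and later. A sketch of how such a flag is typically consumed (GB_HAS_VLA is the only name taken from the source; the function is hypothetical):

    #include <stdlib.h>

    #ifndef GB_HAS_VLA
    #define GB_HAS_VLA 1        // as GB_compiler.h would define under C99
    #endif

    void scale (int n, const double *x, double *y, double alpha)
    {
        #if GB_HAS_VLA
        double t [n] ;          // C99 variable-length array on the stack
        #else
        double *t = malloc (n * sizeof (double)) ;   // pre-C99 fallback
        #endif
        for (int i = 0 ; i < n ; i++) t [i] = alpha * x [i] ;
        for (int i = 0 ; i < n ; i++) y [i] = t [i] ;
        #if !GB_HAS_VLA
        free (t) ;
        #endif
    }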
1 : (2*pA+1)] ; + */ #include "GB_concat_full_template.c" break ; diff --git a/GraphBLAS/Source/GB_concat_sparse.c b/GraphBLAS/Source/GB_concat_sparse.c index eb7ed40d4..33b4932a5 100644 --- a/GraphBLAS/Source/GB_concat_sparse.c +++ b/GraphBLAS/Source/GB_concat_sparse.c @@ -26,6 +26,7 @@ } #include "GB_concat.h" +#include "GB_unused.h" GrB_Info GB_concat_sparse // concatenate into a sparse matrix ( diff --git a/GraphBLAS/Source/GB_control.h b/GraphBLAS/Source/GB_control.h index 3c5d3c6f7..e5ed149d7 100644 --- a/GraphBLAS/Source/GB_control.h +++ b/GraphBLAS/Source/GB_control.h @@ -239,6 +239,7 @@ // #define GxB_NO_TGAMMA 1 // #define GxB_NO_ERF 1 // #define GxB_NO_ERFC 1 +// #define GxB_NO_CBRT 1 // #define GxB_NO_FREXPX 1 // #define GxB_NO_FREXPE 1 diff --git a/GraphBLAS/Source/GB_convert_bitmap_worker.c b/GraphBLAS/Source/GB_convert_bitmap_worker.c index edf0b2cb0..fc88133f6 100644 --- a/GraphBLAS/Source/GB_convert_bitmap_worker.c +++ b/GraphBLAS/Source/GB_convert_bitmap_worker.c @@ -17,6 +17,7 @@ #include "GB.h" #include "GB_partition.h" +#include "GB_unused.h" GrB_Info GB_convert_bitmap_worker // extract CSC/CSR or triplets from bitmap ( diff --git a/GraphBLAS/Source/GB_convert_sparse_to_bitmap.c b/GraphBLAS/Source/GB_convert_sparse_to_bitmap.c index 6f1a13ea9..27fb0a33b 100644 --- a/GraphBLAS/Source/GB_convert_sparse_to_bitmap.c +++ b/GraphBLAS/Source/GB_convert_sparse_to_bitmap.c @@ -71,7 +71,7 @@ GrB_Info GB_convert_sparse_to_bitmap // convert sparse/hypersparse to bitmap const int64_t anz = GB_nnz (A) ; const int64_t avdim = A->vdim ; const int64_t avlen = A->vlen ; - const int64_t anvec = A->nvec ; +// const int64_t anvec = A->nvec ; int64_t anzmax ; if (!GB_int64_multiply ((GrB_Index *) &anzmax, avdim, avlen)) { diff --git a/GraphBLAS/Source/GB_cpu_features.h b/GraphBLAS/Source/GB_cpu_features.h index 21a3601b5..9f34efa1c 100644 --- a/GraphBLAS/Source/GB_cpu_features.h +++ b/GraphBLAS/Source/GB_cpu_features.h @@ -30,6 +30,12 @@ // then no AVX acceleration is used. default: not #define'd (using // Google's cpu_features). 
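The GB_control.h hunk above adds GxB_NO_CBRT, the compile-time disable flag for the cube-root unary operator listed in the v7.2.0 feature set. A hedged usage sketch, assuming the new operator is exposed as GxB_CBRT_FP64, following the naming of the other GxB math operators (SQRT, LOG, TGAMMA, ...):

    #include "GraphBLAS.h"

    // C = cbrt (A), applied entrywise to a double matrix
    GrB_Info apply_cbrt (GrB_Matrix C, GrB_Matrix A)
    {
        // no mask, no accumulator, default descriptor
        return (GrB_Matrix_apply (C, NULL, NULL, GxB_CBRT_FP64, A, NULL)) ;
    }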
+//------------------------------------------------------------------------------ +// GB_compiler.h: determine the compiler and architecture +//------------------------------------------------------------------------------ + +#include "GB_compiler.h" + #ifndef GB_CPU_FEATURES_H #define GB_CPU_FEATURES_H diff --git a/GraphBLAS/Source/GB_cuda_gateway.h b/GraphBLAS/Source/GB_cuda_gateway.h index 8ad7f104a..944117eab 100644 --- a/GraphBLAS/Source/GB_cuda_gateway.h +++ b/GraphBLAS/Source/GB_cuda_gateway.h @@ -30,10 +30,22 @@ #define GB_GPU_CHUNK_DEFAULT (1024*1024) //------------------------------------------------------------------------------ -// rmm_device: properties of each GPU in the system +// GB_cuda_device: properties of each GPU in the system //------------------------------------------------------------------------------ -#include "rmm_device.h" +typedef struct +{ + char name [256] ; + size_t total_global_memory ; + int number_of_sms ; + int compute_capability_major ; + int compute_capability_minor ; + bool use_memory_pool ; + size_t pool_size ; + size_t max_pool_size ; + void *memory_resource ; +} +GB_cuda_device ; //------------------------------------------------------------------------------ // GB_ngpus_to_use: determine # of GPUs to use for the next computation @@ -52,6 +64,7 @@ static inline int GB_ngpus_to_use if (gpu_control == GxB_GPU_NEVER || gpu_count == 0) { // never use the GPU(s) + printf ("(GPU: disabled, gpu_count: %d) ", gpu_count) ; return (0) ; } else if (gpu_control == GxB_GPU_ALWAYS) @@ -66,6 +79,8 @@ static inline int GB_ngpus_to_use // use no more than max_gpus_to_use double gpu_chunk = GB_Global_gpu_chunk_get ( ) ; double max_gpus_to_use = floor (work / gpu_chunk) ; + printf ("(work %g gpu_chunk: %g max gpus to use: %g) ", + work, gpu_chunk, max_gpus_to_use) ; // but use no more than the # of GPUs available if (max_gpus_to_use > gpu_count) return (gpu_count) ; return ((int) max_gpus_to_use) ; @@ -77,12 +92,13 @@ static inline int GB_ngpus_to_use // GB_cuda_* gateway functions //------------------------------------------------------------------------------ +GrB_Info GB_cuda_init (void) ; + bool GB_cuda_get_device_count // true if OK, false if failure ( int *gpu_count // return # of GPUs in the system ) ; - bool GB_cuda_warmup (int device) ; bool GB_cuda_get_device( int *device) ; @@ -92,7 +108,7 @@ bool GB_cuda_set_device( int device) ; bool GB_cuda_get_device_properties ( int device, - rmm_device *prop + GB_cuda_device *prop ) ; bool GB_reduce_to_scalar_cuda_branch @@ -134,5 +150,12 @@ bool GB_AxB_dot3_cuda_branch GB_Context Context ); +#ifdef GBCUDA +#include +#define GB_NVTX { nvtxMark ("nvtx:" __FILE__ ":" GB_XSTR(__LINE__)) ; } +#else +#define GB_NVTX +#endif + #endif diff --git a/GraphBLAS/Source/GB_dense_subassign_23.c b/GraphBLAS/Source/GB_dense_subassign_23.c index 864617ff8..4ed59e9da 100644 --- a/GraphBLAS/Source/GB_dense_subassign_23.c +++ b/GraphBLAS/Source/GB_dense_subassign_23.c @@ -25,6 +25,7 @@ #ifndef GBCUDA_DEV #include "GB_binop__include.h" #endif +#include "GB_unused.h" #define GB_FREE_ALL \ { \ diff --git a/GraphBLAS/Source/GB_deserialize_from_blob.c b/GraphBLAS/Source/GB_deserialize_from_blob.c index 51d66a977..a79e7fc8a 100644 --- a/GraphBLAS/Source/GB_deserialize_from_blob.c +++ b/GraphBLAS/Source/GB_deserialize_from_blob.c @@ -16,6 +16,7 @@ #include "GB.h" #include "GB_serialize.h" #include "GB_lz4.h" +#include "GB_zstd.h" #define GB_FREE_ALL \ { \ @@ -44,7 +45,7 @@ GrB_Info GB_deserialize_from_blob // check inputs 
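For the GB_ngpus_to_use heuristic shown above: with the default GPU control, the candidate GPU count is floor (work / gpu_chunk), then capped by the number of GPUs in the system. A small standalone illustration of the arithmetic (the chunk value matches GB_GPU_CHUNK_DEFAULT from the same header; the other numbers are made up):

    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
        double gpu_chunk = 1024*1024 ;      // GB_GPU_CHUNK_DEFAULT
        int    gpu_count = 4 ;              // GPUs present (assumed)
        double work = 8e6 ;                 // estimated work for the next op
        double max_gpus = floor (work / gpu_chunk) ;    // 7 for these values
        int ngpus = (max_gpus > gpu_count) ? gpu_count : (int) max_gpus ;
        printf ("using %d of %d GPUs\n", ngpus, gpu_count) ;  // 4 of 4
        return (0) ;
    }

Note also that the bare printf calls added to GB_ngpus_to_use read as temporary debug burble, in the same spirit as the /* HACK */ markers added to GB_printf.h later in this diff.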
//-------------------------------------------------------------------------- - GrB_Info info ; +// GrB_Info info ; ASSERT (blob != NULL) ; ASSERT (s_handle != NULL) ; ASSERT (X_handle != NULL) ; @@ -104,11 +105,12 @@ GrB_Info GB_deserialize_from_blob } } - else if (algo == GxB_COMPRESSION_LZ4 || algo == GxB_COMPRESSION_LZ4HC) + else if (algo == GxB_COMPRESSION_LZ4 || algo == GxB_COMPRESSION_LZ4HC + || algo == GxB_COMPRESSION_ZSTD) { //---------------------------------------------------------------------- - // LZ4 / LZ4HC compression + // LZ4, LZ4HC, or ZSTD compression //---------------------------------------------------------------------- int nthreads = GB_IMIN (nthreads_max, nblocks) ; @@ -143,13 +145,27 @@ GrB_Info GB_deserialize_from_blob // GB_deserialize, if requested. const char *src = (const char *) (blob + s + s_start) ; char *dst = (char *) (X + kstart) ; - int src_size = (int) s_size ; - int dst_size = (int) d_size ; - int u = LZ4_decompress_safe (src, dst, src_size, dst_size) ; - if (u != dst_size) - { - // blob is invalid - ok = false ; + if (algo == GxB_COMPRESSION_ZSTD) + { + // ZSTD + size_t u = ZSTD_decompress (dst, d_size, src, s_size) ; + if (u != d_size) + { + // blob is invalid + ok = false ; + } + } + else + { + // LZ4 or LZ4HC + int src_size = (int) s_size ; + int dst_size = (int) d_size ; + int u = LZ4_decompress_safe (src, dst, src_size, dst_size) ; + if (u != dst_size) + { + // blob is invalid + ok = false ; + } } } } diff --git a/GraphBLAS/Source/GB_export.c b/GraphBLAS/Source/GB_export.c index dca70c6b4..3280a29d9 100644 --- a/GraphBLAS/Source/GB_export.c +++ b/GraphBLAS/Source/GB_export.c @@ -86,6 +86,7 @@ GrB_Info GB_export // export/unpack a matrix in any format case GxB_HYPERSPARSE : GB_RETURN_IF_NULL (nvec) ; GB_RETURN_IF_NULL (Ah) ; GB_RETURN_IF_NULL (Ah_size) ; + // fall through to the sparse case case GxB_SPARSE : if (is_sparse_vector) @@ -102,6 +103,7 @@ GrB_Info GB_export // export/unpack a matrix in any format case GxB_BITMAP : GB_RETURN_IF_NULL (nvals) ; GB_RETURN_IF_NULL (Ab) ; GB_RETURN_IF_NULL (Ab_size) ; + // fall through to the full case case GxB_FULL : break ; diff --git a/GraphBLAS/Source/GB_helper.c b/GraphBLAS/Source/GB_helper.c index 931662dc8..678027624 100644 --- a/GraphBLAS/Source/GB_helper.c +++ b/GraphBLAS/Source/GB_helper.c @@ -93,12 +93,12 @@ void GB_helper1i // convert zero-based indices to one-based // GB_helper3: convert 1-based indices to 0-based for gb_mxarray_to_list //------------------------------------------------------------------------------ -bool GB_helper3 // return true if OK, false on error +bool GB_helper3 // return true if OK, false on error ( int64_t *restrict List, // size len, output array const double *restrict List_double, // size len, input array int64_t len, - int64_t *List_max // also compute the max entry in the list + int64_t *List_max // also compute the max entry in the list (1-based) ) { @@ -151,12 +151,12 @@ bool GB_helper3 // return true if OK, false on error // GB_helper3i: convert 1-based indices to 0-based for gb_mxarray_to_list //------------------------------------------------------------------------------ -bool GB_helper3i // return true if OK, false on error +bool GB_helper3i // return true if OK, false on error ( int64_t *restrict List, // size len, output array const int64_t *restrict List_int64, // size len, input array int64_t len, - int64_t *List_max // also compute the max entry in the list + int64_t *List_max // also compute the max entry in the list (1-based) ) { @@ -194,14 +194,15 @@ 
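On the ZSTD path added to GB_deserialize_from_blob above: ZSTD_decompress returns the number of bytes written on success, or an error code encoded as an out-of-range size_t, so the single u != d_size comparison rejects both short output and error returns. An explicit error check is equivalent and slightly more self-documenting (a sketch against the same zstd.h API; the helper name is hypothetical):

    #include <stdbool.h>
    #include <zstd.h>

    // returns true if dst now holds exactly d_size decompressed bytes
    static bool decompress_block (char *dst, size_t d_size,
                                  const char *src, size_t s_size)
    {
        size_t u = ZSTD_decompress (dst, d_size, src, s_size) ;
        return (!ZSTD_isError (u) && u == d_size) ;
    }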
bool GB_helper3i // return true if OK, false on error } //------------------------------------------------------------------------------ -// GB_helper4: find the max entry in an index list for gbbuild +// GB_helper4: find the max entry in a list of type GrB_Index //------------------------------------------------------------------------------ -bool GB_helper4 // return true if OK, false on error +bool GB_helper4 // return true if OK, false on error ( const GrB_Index *restrict I, // array of size len const int64_t len, - GrB_Index *List_max // find max (I) + 1 + GrB_Index *List_max // also compute the max entry in the list (1-based, + // which is max(I)+1) ) { diff --git a/GraphBLAS/Source/GB_hyper_realloc.c b/GraphBLAS/Source/GB_hyper_realloc.c index 7429a81aa..700713540 100644 --- a/GraphBLAS/Source/GB_hyper_realloc.c +++ b/GraphBLAS/Source/GB_hyper_realloc.c @@ -11,6 +11,7 @@ // No change is made if A is not hypersparse. #include "GB.h" +#include "GB_unused.h" GrB_Info GB_hyper_realloc ( diff --git a/GraphBLAS/Source/GB_iceil.h b/GraphBLAS/Source/GB_iceil.h new file mode 100644 index 000000000..602c2c4fb --- /dev/null +++ b/GraphBLAS/Source/GB_iceil.h @@ -0,0 +1,17 @@ +//------------------------------------------------------------------------------ +// GB_iceil.h: definitions for ceiling (a/b) +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_ICEIL_H +#define GB_ICEIL_H + +// ceiling of a/b for two integers a and b +#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) + +#endif + diff --git a/GraphBLAS/Source/GB_import.c b/GraphBLAS/Source/GB_import.c index d4e74e30a..5ed4e0d0d 100644 --- a/GraphBLAS/Source/GB_import.c +++ b/GraphBLAS/Source/GB_import.c @@ -242,6 +242,7 @@ GrB_Info GB_import // import/pack a matrix in any format #endif GB_Global_memtable_add ((*A)->h, (*A)->h_size) ; } + // fall through to the sparse case case GxB_SPARSE : (*A)->jumbled = jumbled ; // import jumbled status diff --git a/GraphBLAS/Source/GB_init.c b/GraphBLAS/Source/GB_init.c index e91fb29cc..4dbaf147b 100644 --- a/GraphBLAS/Source/GB_init.c +++ b/GraphBLAS/Source/GB_init.c @@ -65,7 +65,7 @@ GrB_Info GB_init // start up GraphBLAS GB_Global_GrB_init_called_set (true) ; - if (! (mode == GrB_BLOCKING || mode == GrB_NONBLOCKING)) + if (mode < GrB_NONBLOCKING || mode > GxB_BLOCKING_GPU) { // invalid mode return (GrB_INVALID_VALUE) ; @@ -148,56 +148,26 @@ GrB_Info GB_init // start up GraphBLAS // CUDA initializations //-------------------------------------------------------------------------- -// FIXME for CUDA: MOVE THIS to rmm_wrap (or call it something else) - + GrB_Info info = GrB_SUCCESS ; #if defined ( GBCUDA ) + if (mode == GxB_BLOCKING_GPU || mode == GxB_NONBLOCKING_GPU) { - // TODO: move this code into a function inside CUDA folder -#if 0 - GB_cuda_init ( ) ; or something -#else - // If CUDA exists (#define GBCUDA) and if the caller is GxB_cuda_init, - // then query the system for the # of GPUs available, their memory - // sizes, SM counts, and other capabilities. Unified Memory support is - // assumed. Then warmup each GPU. 
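The new GB_iceil.h above hoists the integer-ceiling macro out of GB_math.h so it can be included on its own (GB_serialize_array.c, later in this diff, uses it for compression block sizing). The macro computes ceil (a/b) for positive integers only; a quick sanity check:

    #include <assert.h>

    #define GB_ICEIL(a,b) (((a) + (b) - 1) / (b))

    int main (void)
    {
        assert (GB_ICEIL (10, 4) == 3) ;    // ceil (10/4) = 3
        assert (GB_ICEIL (12, 4) == 3) ;    // exact division: no rounding up
        assert (GB_ICEIL ( 1, 4) == 1) ;
        // valid for a > 0 and b > 0 only; a + b - 1 must not overflow
        return (0) ;
    }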
- - // query the system for the # of GPUs - // TODO for GPU: make this a function in the CUDA folder - GB_Global_gpu_control_set (GxB_DEFAULT) ; - if (!GB_Global_gpu_count_set (true)) return (GrB_PANIC) ; - int gpu_count = GB_Global_gpu_count_get ( ) ; - for (int device = 0 ; device < 1 ; device++) // TODO for GPU: gpu_count - { - // query the GPU and then warm it up - if (!GB_Global_gpu_device_properties_get (device)) - { - return (GrB_PANIC) ; - } - if (!GB_cuda_warmup (device)) - { - return (GrB_PANIC) ; - } - } - // make GPU 0 the default device - GB_cuda_set_device( 0 ); - - // also check for jit cache, pre-load library of common kernels ... -#endif + // initialize the GPUs + info = GB_cuda_init ( ) ; } - #else + else + #endif { - // CUDA not available at compile-time + // CUDA not available at compile-time, or not requested at run time GB_Global_gpu_control_set (GxB_GPU_NEVER) ; GB_Global_gpu_count_set (0) ; + GB_Global_gpu_chunk_set (GxB_DEFAULT) ; } - #endif - - GB_Global_gpu_chunk_set (GxB_DEFAULT) ; //-------------------------------------------------------------------------- // return result //-------------------------------------------------------------------------- - return (GrB_SUCCESS) ; + return (info) ; } diff --git a/GraphBLAS/Source/GB_iso_unop.c b/GraphBLAS/Source/GB_iso_unop.c index f5433f81c..afbb7f06e 100644 --- a/GraphBLAS/Source/GB_iso_unop.c +++ b/GraphBLAS/Source/GB_iso_unop.c @@ -35,7 +35,7 @@ void GB_iso_unop // Cx [0] = unop (A), binop (s,A) or binop (A,s) ASSERT (Cx != NULL) ; GrB_Type stype = (scalar != NULL) ? scalar->type : GrB_BOOL ; - const size_t csize = ctype->size ; +// const size_t csize = ctype->size ; const size_t asize = A->type->size ; const size_t ssize = stype->size ; const GB_Type_code ccode = ctype->code ; diff --git a/GraphBLAS/Source/GB_lz4.h b/GraphBLAS/Source/GB_lz4.h index 1a248a4ef..a2c1783fb 100644 --- a/GraphBLAS/Source/GB_lz4.h +++ b/GraphBLAS/Source/GB_lz4.h @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_lz4.h: defintions for a wrapper for the LZ4 compression library +// GB_lz4.h: definitions for a wrapper for the LZ4 compression library //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
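The GB_init rewrite above replaces the inline CUDA probing with a single call to GB_cuda_init, and widens the mode check to the range GrB_NONBLOCKING .. GxB_BLOCKING_GPU, matching the two new GPU modes noted in GB_Global.c earlier in this diff. A hedged sketch of initialization under the new scheme (assuming the GxB_* modes are passed through the standard GrB_init entry point, as the range check suggests):

    #include "GraphBLAS.h"

    int main (void)
    {
        // request the GPU-enabled non-blocking mode; per the hunk above,
        // a build without GBCUDA still accepts this mode but forces the
        // GPU control to GxB_GPU_NEVER
        GrB_Info info = GrB_init (GxB_NONBLOCKING_GPU) ;
        if (info != GrB_SUCCESS) return (1) ;
        // ... use GraphBLAS ...
        GrB_finalize ( ) ;
        return (0) ;
    }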
diff --git a/GraphBLAS/Source/GB_math.h b/GraphBLAS/Source/GB_math.h index 5677ab422..c745b9122 100644 --- a/GraphBLAS/Source/GB_math.h +++ b/GraphBLAS/Source/GB_math.h @@ -133,7 +133,7 @@ #include "GB_imin.h" // ceiling of a/b for two integers a and b -#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b)) +#include "GB_iceil.h" //------------------------------------------------------------------------------ // division by zero diff --git a/GraphBLAS/Source/GB_matvec_check.c b/GraphBLAS/Source/GB_matvec_check.c index 36cdd04ff..a7082b85a 100644 --- a/GraphBLAS/Source/GB_matvec_check.c +++ b/GraphBLAS/Source/GB_matvec_check.c @@ -510,7 +510,7 @@ GrB_Info GB_matvec_check // check a GraphBLAS matrix or vector } } - if (!ignore_zombies && (A->nzombies < 0 || A->nzombies > anz)) + if (!ignore_zombies && (A->nzombies > anz)) { GBPR0 (" invalid number of zombies: " GBd " " "must be >= 0 and <= # entries (" GBd ")\n", A->nzombies, anz) ; @@ -721,7 +721,7 @@ GrB_Info GB_matvec_check // check a GraphBLAS matrix or vector // matrix has tuples, arrays and type must not be NULL // Pending->x must be NULL if and only if A is iso // Pending->x must be non-NULL if and only if A is non-iso - if (Pending->i == NULL || (Pending->x == NULL != (A->iso)) || + if (Pending->i == NULL || ((Pending->x == NULL) != (A->iso)) || (A->vdim > 1 && Pending->j == NULL)) { GBPR0 (" invalid pending tuples\n") ; diff --git a/GraphBLAS/Source/GB_opaque.h b/GraphBLAS/Source/GB_opaque.h index 3a8468dad..e4ca5f609 100644 --- a/GraphBLAS/Source/GB_opaque.h +++ b/GraphBLAS/Source/GB_opaque.h @@ -106,41 +106,42 @@ typedef enum GB_TGAMMA_unop_code = 34, // z = tgamma (x) GB_ERF_unop_code = 35, // z = erf (x) GB_ERFC_unop_code = 36, // z = erfc (x) - GB_FREXPX_unop_code = 37, // z = frexpx (x), mantissa of ANSI C11 frexp - GB_FREXPE_unop_code = 38, // z = frexpe (x), exponent of ANSI C11 frexp + GB_CBRT_unop_code = 37, // z = cbrt (x) + GB_FREXPX_unop_code = 38, // z = frexpx (x), mantissa of ANSI C11 frexp + GB_FREXPE_unop_code = 39, // z = frexpe (x), exponent of ANSI C11 frexp //-------------------------------------------------------------------------- // unary operators for complex types only //-------------------------------------------------------------------------- - GB_CONJ_unop_code = 39, // z = conj (x) + GB_CONJ_unop_code = 40, // z = conj (x) //-------------------------------------------------------------------------- // unary operators where z is real and x is complex //-------------------------------------------------------------------------- - GB_CREAL_unop_code = 40, // z = creal (x) - GB_CIMAG_unop_code = 41, // z = cimag (x) - GB_CARG_unop_code = 42, // z = carg (x) + GB_CREAL_unop_code = 41, // z = creal (x) + GB_CIMAG_unop_code = 42, // z = cimag (x) + GB_CARG_unop_code = 43, // z = carg (x) //-------------------------------------------------------------------------- // unary operators where z is bool and x is any floating-point type //-------------------------------------------------------------------------- - GB_ISINF_unop_code = 43, // z = isinf (x) - GB_ISNAN_unop_code = 44, // z = isnan (x) - GB_ISFINITE_unop_code = 45, // z = isfinite (x) + GB_ISINF_unop_code = 44, // z = isinf (x) + GB_ISNAN_unop_code = 45, // z = isnan (x) + GB_ISFINITE_unop_code = 46, // z = isfinite (x) //-------------------------------------------------------------------------- // positional unary operators: z is int32 or int64, x is ignored //-------------------------------------------------------------------------- - GB_POSITIONI_unop_code = 
46, // z = position_i(A(i,j)) == i - GB_POSITIONI1_unop_code = 47, // z = position_i1(A(i,j)) == i+1 - GB_POSITIONJ_unop_code = 48, // z = position_j(A(i,j)) == j - GB_POSITIONJ1_unop_code = 49, // z = position_j1(A(i,j)) == j+1 + GB_POSITIONI_unop_code = 47, // z = position_i(A(i,j)) == i + GB_POSITIONI1_unop_code = 48, // z = position_i1(A(i,j)) == i+1 + GB_POSITIONJ_unop_code = 49, // z = position_j(A(i,j)) == j + GB_POSITIONJ1_unop_code = 50, // z = position_j1(A(i,j)) == j+1 - GB_USER_unop_code = 50, + GB_USER_unop_code = 51, // true if opcode is for a GrB_UnaryOp #define GB_IS_UNARYOP_CODE(opcode) \ @@ -159,30 +160,30 @@ typedef enum // operator codes used in GrB_IndexUnaryOp structures // Result is INT32 or INT64, depending on i and/or j, and thunk: - GB_ROWINDEX_idxunop_code = 51, // (i+thunk): row index - thunk - GB_COLINDEX_idxunop_code = 52, // (j+thunk): col index - thunk - GB_DIAGINDEX_idxunop_code = 53, // (j-(i+thunk)): diag index + thunk - GB_FLIPDIAGINDEX_idxunop_code = 54, // (i-(j+thunk)), internal use only + GB_ROWINDEX_idxunop_code = 52, // (i+thunk): row index - thunk + GB_COLINDEX_idxunop_code = 53, // (j+thunk): col index - thunk + GB_DIAGINDEX_idxunop_code = 54, // (j-(i+thunk)): diag index + thunk + GB_FLIPDIAGINDEX_idxunop_code = 55, // (i-(j+thunk)), internal use only // Result is BOOL, depending on i and/or j, and thunk: - GB_TRIL_idxunop_code = 55, // (j <= (i+thunk)): tril (A,thunk) - GB_TRIU_idxunop_code = 56, // (j >= (i+thunk)): triu (A,thunk) - GB_DIAG_idxunop_code = 57, // (j == (i+thunk)): diag(A,thunk) - GB_OFFDIAG_idxunop_code = 58, // (j != (i+thunk)): offdiag(A,thunk) - GB_COLLE_idxunop_code = 59, // (j <= thunk): A (:,0:thunk) - GB_COLGT_idxunop_code = 60, // (j > thunk): A (:,thunk+1:ncols-1) - GB_ROWLE_idxunop_code = 61, // (i <= thunk): A (0:thunk,:) - GB_ROWGT_idxunop_code = 62, // (i > thunk): A (thunk+1:nrows-1,:) + GB_TRIL_idxunop_code = 56, // (j <= (i+thunk)): tril (A,thunk) + GB_TRIU_idxunop_code = 57, // (j >= (i+thunk)): triu (A,thunk) + GB_DIAG_idxunop_code = 58, // (j == (i+thunk)): diag(A,thunk) + GB_OFFDIAG_idxunop_code = 59, // (j != (i+thunk)): offdiag(A,thunk) + GB_COLLE_idxunop_code = 60, // (j <= thunk): A (:,0:thunk) + GB_COLGT_idxunop_code = 61, // (j > thunk): A (:,thunk+1:ncols-1) + GB_ROWLE_idxunop_code = 62, // (i <= thunk): A (0:thunk,:) + GB_ROWGT_idxunop_code = 63, // (i > thunk): A (thunk+1:nrows-1,:) // Result is BOOL, depending on the value aij and thunk: - GB_VALUENE_idxunop_code = 63, // (aij != thunk) - GB_VALUEEQ_idxunop_code = 64, // (aij == thunk) - GB_VALUEGT_idxunop_code = 65, // (aij > thunk) - GB_VALUEGE_idxunop_code = 66, // (aij >= thunk) - GB_VALUELT_idxunop_code = 67, // (aij < thunk) - GB_VALUELE_idxunop_code = 68, // (aij <= thunk) + GB_VALUENE_idxunop_code = 64, // (aij != thunk) + GB_VALUEEQ_idxunop_code = 65, // (aij == thunk) + GB_VALUEGT_idxunop_code = 66, // (aij > thunk) + GB_VALUEGE_idxunop_code = 67, // (aij >= thunk) + GB_VALUELT_idxunop_code = 68, // (aij < thunk) + GB_VALUELE_idxunop_code = 69, // (aij <= thunk) - GB_USER_idxunop_code = 69, + GB_USER_idxunop_code = 70, // true if opcode is for a GrB_IndexUnaryOp #define GB_IS_INDEXUNARYOP_CODE(opcode) \ @@ -202,82 +203,82 @@ typedef enum // binary operators z=f(x,y) that return the same type as their inputs //-------------------------------------------------------------------------- - GB_FIRST_binop_code = 70, // z = x - GB_SECOND_binop_code = 71, // z = y - GB_ANY_binop_code = 72, // z = x or y, selected arbitrarily - GB_PAIR_binop_code 
= 73, // z = 1 - GB_MIN_binop_code = 74, // z = min(x,y) - GB_MAX_binop_code = 75, // z = max(x,y) - GB_PLUS_binop_code = 76, // z = x + y - GB_MINUS_binop_code = 77, // z = x - y - GB_RMINUS_binop_code = 78, // z = y - x - GB_TIMES_binop_code = 79, // z = x * y - GB_DIV_binop_code = 80, // z = x / y - GB_RDIV_binop_code = 81, // z = y / x - GB_POW_binop_code = 82, // z = pow (x,y) - - GB_ISEQ_binop_code = 83, // z = (x == y) - GB_ISNE_binop_code = 84, // z = (x != y) - GB_ISGT_binop_code = 85, // z = (x > y) - GB_ISLT_binop_code = 86, // z = (x < y) - GB_ISGE_binop_code = 87, // z = (x >= y) - GB_ISLE_binop_code = 88, // z = (x <= y) - - GB_LOR_binop_code = 89, // z = (x != 0) || (y != 0) - GB_LAND_binop_code = 90, // z = (x != 0) && (y != 0) - GB_LXOR_binop_code = 91, // z = (x != 0) != (y != 0) - - GB_BOR_binop_code = 92, // z = (x | y), bitwise or - GB_BAND_binop_code = 93, // z = (x & y), bitwise and - GB_BXOR_binop_code = 94, // z = (x ^ y), bitwise xor - GB_BXNOR_binop_code = 95, // z = ~(x ^ y), bitwise xnor - GB_BGET_binop_code = 96, // z = bitget (x,y) - GB_BSET_binop_code = 97, // z = bitset (x,y) - GB_BCLR_binop_code = 98, // z = bitclr (x,y) - GB_BSHIFT_binop_code = 99, // z = bitshift (x,y) + GB_FIRST_binop_code = 71, // z = x + GB_SECOND_binop_code = 72, // z = y + GB_ANY_binop_code = 73, // z = x or y, selected arbitrarily + GB_PAIR_binop_code = 74, // z = 1 + GB_MIN_binop_code = 75, // z = min(x,y) + GB_MAX_binop_code = 76, // z = max(x,y) + GB_PLUS_binop_code = 77, // z = x + y + GB_MINUS_binop_code = 78, // z = x - y + GB_RMINUS_binop_code = 79, // z = y - x + GB_TIMES_binop_code = 80, // z = x * y + GB_DIV_binop_code = 81, // z = x / y + GB_RDIV_binop_code = 82, // z = y / x + GB_POW_binop_code = 83, // z = pow (x,y) + + GB_ISEQ_binop_code = 84, // z = (x == y) + GB_ISNE_binop_code = 85, // z = (x != y) + GB_ISGT_binop_code = 86, // z = (x > y) + GB_ISLT_binop_code = 87, // z = (x < y) + GB_ISGE_binop_code = 88, // z = (x >= y) + GB_ISLE_binop_code = 89, // z = (x <= y) + + GB_LOR_binop_code = 90, // z = (x != 0) || (y != 0) + GB_LAND_binop_code = 91, // z = (x != 0) && (y != 0) + GB_LXOR_binop_code = 92, // z = (x != 0) != (y != 0) + + GB_BOR_binop_code = 93, // z = (x | y), bitwise or + GB_BAND_binop_code = 94, // z = (x & y), bitwise and + GB_BXOR_binop_code = 95, // z = (x ^ y), bitwise xor + GB_BXNOR_binop_code = 96, // z = ~(x ^ y), bitwise xnor + GB_BGET_binop_code = 97, // z = bitget (x,y) + GB_BSET_binop_code = 98, // z = bitset (x,y) + GB_BCLR_binop_code = 99, // z = bitclr (x,y) + GB_BSHIFT_binop_code =100, // z = bitshift (x,y) //-------------------------------------------------------------------------- // binary operators z=f(x,y) that return bool (TxT -> bool) //-------------------------------------------------------------------------- - GB_EQ_binop_code = 100, // z = (x == y), is LXNOR for bool - GB_NE_binop_code = 101, // z = (x != y) - GB_GT_binop_code = 102, // z = (x > y) - GB_LT_binop_code = 103, // z = (x < y) - GB_GE_binop_code = 104, // z = (x >= y) - GB_LE_binop_code = 105, // z = (x <= y) + GB_EQ_binop_code = 101, // z = (x == y), is LXNOR for bool + GB_NE_binop_code = 102, // z = (x != y) + GB_GT_binop_code = 103, // z = (x > y) + GB_LT_binop_code = 104, // z = (x < y) + GB_GE_binop_code = 105, // z = (x >= y) + GB_LE_binop_code = 106, // z = (x <= y) //-------------------------------------------------------------------------- // binary operators for real floating-point types (TxT -> T) 
//-------------------------------------------------------------------------- - GB_ATAN2_binop_code = 106, // z = atan2 (x,y) - GB_HYPOT_binop_code = 107, // z = hypot (x,y) - GB_FMOD_binop_code = 108, // z = fmod (x,y) - GB_REMAINDER_binop_code = 109, // z = remainder (x,y) - GB_COPYSIGN_binop_code = 110, // z = copysign (x,y) - GB_LDEXP_binop_code = 111, // z = ldexp (x,y) + GB_ATAN2_binop_code = 107, // z = atan2 (x,y) + GB_HYPOT_binop_code = 108, // z = hypot (x,y) + GB_FMOD_binop_code = 109, // z = fmod (x,y) + GB_REMAINDER_binop_code = 110, // z = remainder (x,y) + GB_COPYSIGN_binop_code = 111, // z = copysign (x,y) + GB_LDEXP_binop_code = 112, // z = ldexp (x,y) //-------------------------------------------------------------------------- // binary operator z=f(x,y) where z is complex, x,y real: //-------------------------------------------------------------------------- - GB_CMPLX_binop_code = 112, // z = cmplx (x,y) + GB_CMPLX_binop_code = 113, // z = cmplx (x,y) //-------------------------------------------------------------------------- // positional binary operators: z is int64, x and y are ignored //-------------------------------------------------------------------------- - GB_FIRSTI_binop_code = 113, // z = first_i(A(i,j),y) == i - GB_FIRSTI1_binop_code = 114, // z = first_i1(A(i,j),y) == i+1 - GB_FIRSTJ_binop_code = 115, // z = first_j(A(i,j),y) == j - GB_FIRSTJ1_binop_code = 116, // z = first_j1(A(i,j),y) == j+1 - GB_SECONDI_binop_code = 117, // z = second_i(x,B(i,j)) == i - GB_SECONDI1_binop_code = 118, // z = second_i1(x,B(i,j)) == i+1 - GB_SECONDJ_binop_code = 119, // z = second_j(x,B(i,j)) == j - GB_SECONDJ1_binop_code = 120, // z = second_j1(x,B(i,j)) == j+1 + GB_FIRSTI_binop_code = 114, // z = first_i(A(i,j),y) == i + GB_FIRSTI1_binop_code = 115, // z = first_i1(A(i,j),y) == i+1 + GB_FIRSTJ_binop_code = 116, // z = first_j(A(i,j),y) == j + GB_FIRSTJ1_binop_code = 117, // z = first_j1(A(i,j),y) == j+1 + GB_SECONDI_binop_code = 118, // z = second_i(x,B(i,j)) == i + GB_SECONDI1_binop_code = 119, // z = second_i1(x,B(i,j)) == i+1 + GB_SECONDJ_binop_code = 120, // z = second_j(x,B(i,j)) == j + GB_SECONDJ1_binop_code = 121, // z = second_j1(x,B(i,j)) == j+1 - GB_USER_binop_code = 121, + GB_USER_binop_code = 122, // true if opcode is for a GrB_BinaryOp #define GB_IS_BINARYOP_CODE(opcode) \ @@ -293,29 +294,29 @@ typedef enum //========================================================================== // built-in positional select operators: thunk optional; defaults to zero - GB_TRIL_selop_code = 122, - GB_TRIU_selop_code = 123, - GB_DIAG_selop_code = 124, - GB_OFFDIAG_selop_code = 125, + GB_TRIL_selop_code = 123, + GB_TRIU_selop_code = 124, + GB_DIAG_selop_code = 125, + GB_OFFDIAG_selop_code = 126, // built-in select operators, no thunk used - GB_NONZOMBIE_selop_code = 126, - GB_NONZERO_selop_code = 127, - GB_EQ_ZERO_selop_code = 128, - GB_GT_ZERO_selop_code = 129, - GB_GE_ZERO_selop_code = 130, - GB_LT_ZERO_selop_code = 131, - GB_LE_ZERO_selop_code = 132, + GB_NONZOMBIE_selop_code = 127, + GB_NONZERO_selop_code = 128, + GB_EQ_ZERO_selop_code = 129, + GB_GT_ZERO_selop_code = 130, + GB_GE_ZERO_selop_code = 131, + GB_LT_ZERO_selop_code = 132, + GB_LE_ZERO_selop_code = 133, // built-in select operators, thunk optional; defaults to zero - GB_NE_THUNK_selop_code = 133, - GB_EQ_THUNK_selop_code = 134, - GB_GT_THUNK_selop_code = 135, - GB_GE_THUNK_selop_code = 136, - GB_LT_THUNK_selop_code = 137, - GB_LE_THUNK_selop_code = 138, - - GB_USER_selop_code = 139 + GB_NE_THUNK_selop_code = 
134, + GB_EQ_THUNK_selop_code = 135, + GB_GT_THUNK_selop_code = 136, + GB_GE_THUNK_selop_code = 137, + GB_LT_THUNK_selop_code = 138, + GB_LE_THUNK_selop_code = 139, + + GB_USER_selop_code = 140 // true if opcode is for a GxB_SelectOp #define GB_IS_SELECTOP_CODE(opcode) \ diff --git a/GraphBLAS/Source/GB_ops.c b/GraphBLAS/Source/GB_ops.c index 035f1a7a1..f6c56c42c 100644 --- a/GraphBLAS/Source/GB_ops.c +++ b/GraphBLAS/Source/GB_ops.c @@ -43,7 +43,7 @@ name, \ NULL \ } ; \ - GrB_Type prefix ## _ ## type = & GB_OPAQUE (type) ; + GrB_Type prefix ## _ ## type = & GB_OPAQUE (type) GB_TYPEDEF (GrB, BOOL , bool , "bool" ) ; GB_TYPEDEF (GrB, INT8 , int8_t , "int8_t" ) ; @@ -76,8 +76,9 @@ GB_TYPEDEF (GxB, FC64 , GxB_FC64_t, "double complex") ; (GrB_Desc_Value) (in0), \ (GrB_Desc_Value) (in1), \ o, o, /* default: axb, #threads */ \ + 0, /* default compression */ \ 0, /* no sort */ \ - 0 /* default compression: LZ4 */ \ + 0 /* import */ \ } ; \ GrB_Descriptor GRB (DESC_ ## name) = & GB_OPAQUE (desc_ ## name) ; @@ -147,22 +148,22 @@ GB_DESC (RSCT0T1, GrB_REPLACE, GrB_STRUCTURE + GrB_COMP, GrB_TRAN, GrB_TRAN ) str, \ GB_ ## op ## _unop_code, \ NULL \ - } ; + } #define GRB_OP1z(op,str,z_t,ztype) \ - GB_OP1zx (op, str, z_t, ztype, GB_TYPE, GB_XTYPE) \ - GrB_UnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GB_OP1zx (op, str, z_t, ztype, GB_TYPE, GB_XTYPE) ; \ + GrB_UnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) #define GRB_OP1(op,str) GRB_OP1z (op, str, GB_TYPE, GB_XTYPE) #define GXB_OP1z(op,str,z_t,ztype) \ - GB_OP1zx (op, str, z_t, ztype, GB_TYPE, GB_XTYPE) \ - GrB_UnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GB_OP1zx (op, str, z_t, ztype, GB_TYPE, GB_XTYPE) ; \ + GrB_UnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) #define GXB_OP1(op,str) GXB_OP1z (op, str, GB_TYPE, GB_XTYPE) #define GXB_OP1_RENAME(op) \ - GrB_UnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GrB_UnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) //------------------------------------------------------------------------------ // helper macros to define binary operators @@ -180,23 +181,23 @@ GB_DESC (RSCT0T1, GrB_REPLACE, GrB_STRUCTURE + GrB_COMP, GrB_TRAN, GrB_TRAN ) str, \ GB_ ## op ## _binop_code, \ NULL \ - } ; + } #define GRB_OP2z(op,str,z_t,ztype) \ - GB_OP2zxy (op, str, z_t, ztype, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE) \ - GrB_BinaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GB_OP2zxy (op, str, z_t, ztype, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE) ; \ + GrB_BinaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) #define GRB_OP2(op,str) GRB_OP2z (op, str, GB_TYPE, GB_XTYPE) #define GXB_OP2z(op,str,z_t,ztype) \ - GB_OP2zxy (op, str, z_t, ztype, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE) \ - GrB_BinaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GB_OP2zxy (op, str, z_t, ztype, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE) ; \ + GrB_BinaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) #define GXB_OP2(op,str) GXB_OP2z (op, str, GB_TYPE, GB_XTYPE) #define GXB_OP2shift(op,str) \ - GB_OP2zxy (op, str, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE, int8_t, INT8) \ - GrB_BinaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GB_OP2zxy (op, str, GB_TYPE, GB_XTYPE, GB_TYPE, GB_XTYPE, int8_t, INT8) ; \ + GrB_BinaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) //------------------------------------------------------------------------------ // positional unary and binary operators @@ -219,7 +220,7 @@ GB_DESC (RSCT0T1, GrB_REPLACE, GrB_STRUCTURE + GrB_COMP, GrB_TRAN, GrB_TRAN ) GB_ ## op ## _unop_code, \ NULL \ } ; \ - 
GrB_UnaryOp GXB (op ## _ ## type) = & GB_OPAQUE (op ## _ ## type) ; + GrB_UnaryOp GXB (op ## _ ## type) = & GB_OPAQUE (op ## _ ## type) // helper macros to define positional binary operators #define GXB_OP2_POS(op,str,type) \ @@ -234,7 +235,7 @@ GB_DESC (RSCT0T1, GrB_REPLACE, GrB_STRUCTURE + GrB_COMP, GrB_TRAN, GrB_TRAN ) GB_ ## op ## _binop_code, \ NULL \ } ; \ - GrB_BinaryOp GXB (op ## _ ## type) = & GB_OPAQUE (op ## _ ## type) ; + GrB_BinaryOp GXB (op ## _ ## type) = & GB_OPAQUE (op ## _ ## type) GXB_OP1_POS (POSITIONI , "positioni" , INT32) ; GXB_OP1_POS (POSITIONI , "positioni" , INT64) ; @@ -284,7 +285,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _idxunop_code, \ NULL \ } ; \ - GrB_IndexUnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GrB_IndexUnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) // GxB_IndexUnaryOps that depend on i,j,thunk but not A(i,j), and result has // the same type as the thunk: FLIPDIAGINDEX @@ -303,7 +304,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _idxunop_code, \ NULL \ } ; \ - GrB_IndexUnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GrB_IndexUnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) // IndexUnaryOps that depend on i,j, and thunk but not A(i,j), and result is // bool: TRIL, TRIU, DIAG, OFFDIAG, COLLE, COLGT, ROWLE, ROWGT @@ -322,7 +323,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _idxunop_code, \ NULL \ } ; \ - GrB_IndexUnaryOp GRB (op) = & GB_OPAQUE (GB_OP (op)) ; + GrB_IndexUnaryOp GRB (op) = & GB_OPAQUE (GB_OP (op)) // GrB_IndexUnaryOps that depend on A(i,j), and result is bool: VALUE* ops #define GRB_IDXOP_VALUE(op,str) \ @@ -340,7 +341,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _idxunop_code, \ NULL \ } ; \ - GrB_IndexUnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GrB_IndexUnaryOp GRB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) // GrB_IndexUnaryOps that depend on A(i,j), result is bool: VALUE* complex ops #define GXB_IDXOP_VALUE(op,str) \ @@ -358,7 +359,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _idxunop_code, \ NULL \ } ; \ - GrB_IndexUnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) ; + GrB_IndexUnaryOp GXB (GB_OP (op)) = & GB_OPAQUE (GB_OP (op)) //------------------------------------------------------------------------------ // built-in select operators @@ -376,7 +377,7 @@ GXB_OP2_POS (SECONDJ1 , "secondj1" , INT64) ; GB_ ## op ## _selop_code, \ NULL \ } ; \ - GxB_SelectOp GXB (op) = & GB_OPAQUE (op) ; + GxB_SelectOp GXB (op) = & GB_OPAQUE (op) GXB_SEL (TRIL , "tril" ) ; GXB_SEL (TRIU , "triu" ) ; diff --git a/GraphBLAS/Source/GB_printf.h b/GraphBLAS/Source/GB_printf.h index 140ba086e..75b3368bb 100644 --- a/GraphBLAS/Source/GB_printf.h +++ b/GraphBLAS/Source/GB_printf.h @@ -140,6 +140,7 @@ void GB_burble_assign // define the function to use to burble #define GBURBLE(...) 
\ { \ + GB_NVTX ; /* HACK */ \ if (GB_Global_burble_get ( )) \ { \ GBDUMP (__VA_ARGS__) ; \ @@ -169,6 +170,7 @@ void GB_burble_assign #define GB_BURBLE_START(func) \ double t_burble = 0 ; \ { \ + GB_NVTX /* HACK */ \ if (GB_Global_burble_get ( )) \ { \ GBURBLE (" [ " func " ") ; \ @@ -178,6 +180,7 @@ void GB_burble_assign #define GB_BURBLE_END \ { \ + GB_NVTX /* HACK */ \ if (GB_Global_burble_get ( )) \ { \ t_burble = GB_OPENMP_GET_WTIME - t_burble ; \ diff --git a/GraphBLAS/Source/GB_reduce_to_scalar.c b/GraphBLAS/Source/GB_reduce_to_scalar.c index 157bb3191..4406b98d2 100644 --- a/GraphBLAS/Source/GB_reduce_to_scalar.c +++ b/GraphBLAS/Source/GB_reduce_to_scalar.c @@ -101,7 +101,7 @@ GrB_Info GB_reduce_to_scalar // s = reduce_to_scalar (A) //-------------------------------------------------------------------------- #if defined ( GBCUDA ) - if (!A->iso && GB_reduce_to_scalar_cuda_branch (reduce, A, Context)) + if (GB_reduce_to_scalar_cuda_branch (reduce, A, Context)) { //---------------------------------------------------------------------- diff --git a/GraphBLAS/Source/GB_reshape.c b/GraphBLAS/Source/GB_reshape.c new file mode 100644 index 000000000..775d2a7dc --- /dev/null +++ b/GraphBLAS/Source/GB_reshape.c @@ -0,0 +1,396 @@ +//------------------------------------------------------------------------------ +// GB_reshape: reshape a matrix into another matrix +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// If the input matrix is nrows-by-ncols, and the size of the newly-created +// matrix C is nrows_new-by-ncols_new, then nrows*ncols must equal +// nrows_new*ncols_new. 
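The sparse kernel in GB_reshape below linearizes each entry and re-folds it into the new shape: an entry at (iold, jold) of a by-column matrix with vector length tvlen has 1D index iold + jold*tvlen, which maps to inew = index % vlen_new and jnew = index / vlen_new. A standalone sketch of the same arithmetic (column-major, matching by_col = true; the function is illustrative):

    #include <stdint.h>

    // map entry (iold,jold) of a matrix with vector length old_vlen,
    // stored by column, to (inew,jnew) in a reshape with vector length
    // new_vlen; assumes old and new shapes have equal total size
    static void reshape_index (int64_t iold, int64_t jold, int64_t old_vlen,
        int64_t new_vlen, int64_t *inew, int64_t *jnew)
    {
        int64_t index_1d = iold + jold * old_vlen ;   // linearize
        (*inew) = index_1d % new_vlen ;               // fold into new shape
        (*jnew) = index_1d / new_vlen ;
    }

The v7.2.0 feature list exposes this kernel as the reshape methods; assuming the released names GxB_Matrix_reshape (in-place) and GxB_Matrix_reshapeDup (copying), they correspond to the Chandle == NULL and Chandle != NULL cases of GB_reshape.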
+ +#include "GB.h" +#include "GB_reshape.h" +#include "GB_transpose.h" +#include "GB_ek_slice.h" +#include "GB_build.h" + +#define GB_FREE_WORKSPACE \ +{ \ + GB_WERK_POP (T_ek_slicing, int64_t) ; \ + GB_FREE (&I_work, I_work_size) ; \ + GB_FREE (&J_work, J_work_size) ; \ + GB_FREE (&S_work, S_work_size) ; \ + if (T != A && T != C) \ + { \ + GB_Matrix_free (&T) ; \ + } \ +} + +#define GB_FREE_ALL \ +{ \ + GB_FREE_WORKSPACE ; \ + if (Chandle == NULL) \ + { \ + GB_phbix_free (A) ; \ + } \ + else \ + { \ + GB_Matrix_free (&C) ; \ + } \ +} + +GrB_Info GB_reshape // reshape a GrB_Matrix into another GrB_Matrix +( + // output, if not in-place: + GrB_Matrix *Chandle, // output matrix, in place if Chandle == NULL + // input, or input/output: + GrB_Matrix A, // input matrix, or input/output if in-place + // input: + bool by_col, // true if reshape by column, false if by row + int64_t nrows_new, // number of rows of C + int64_t ncols_new, // number of columns of C + GB_Context Context +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + GrB_Info info ; + ASSERT_MATRIX_OK (A, "A for reshape", GB0) ; + + int64_t *I_work = NULL, *J_work = NULL ; + GB_void *S_work = NULL, *S_input = NULL ; + size_t I_work_size = 0, J_work_size = 0, S_work_size = 0 ; + GB_WERK_DECLARE (T_ek_slicing, int64_t) ; + GrB_Matrix C = NULL, T = NULL ; + + bool in_place = (Chandle == NULL) ; + if (!in_place) + { + (*Chandle) = NULL ; + } + + GrB_Index matrix_size, s ; + int64_t nrows_old = GB_NROWS (A) ; + int64_t ncols_old = GB_NCOLS (A) ; + bool ok = GB_int64_multiply (&matrix_size, nrows_old, ncols_old) ; + if (!ok) + { + // problem too large + return (GrB_OUT_OF_MEMORY) ; + } + + ok = GB_int64_multiply (&s, nrows_new, ncols_new) ; + if (!ok || s != matrix_size) + { + // dimensions are invalid + return (GrB_DIMENSION_MISMATCH) ; + } + + //-------------------------------------------------------------------------- + // finish any pending work, and transpose the input matrix if needed + //-------------------------------------------------------------------------- + + GB_MATRIX_WAIT (A) ; + + GrB_Type type = A->type ; + bool A_is_csc = A->is_csc ; + if (A_is_csc != by_col) + { + // transpose the input matrix + if (in_place) + { + // transpose A in-place + GB_OK (GB_transpose_in_place (A, by_col, Context)) ; + T = A ; + } + else + { + // T = A' + GB_OK (GB_new (&T, // new header + type, A->vdim, A->vlen, GB_Ap_null, by_col, GxB_AUTO_SPARSITY, + GB_Global_hyper_switch_get ( ), 0, Context)) ; + GB_OK (GB_transpose_cast (T, type, by_col, A, false, Context)) ; + // now T can be reshaped in-place to construct C + in_place = true ; + } + } + else + { + // use T = A as-is, and reshape it either in-place or not in-place + T = A ; + } + + // T is now in the format required for the reshape + ASSERT_MATRIX_OK (T, "T for reshape", GB0) ; + ASSERT (T->is_csc == by_col) ; + + //-------------------------------------------------------------------------- + // determine the dimensions of C + //-------------------------------------------------------------------------- + + int64_t vlen_new, vdim_new ; + bool T_is_csc = T->is_csc ; + if (T_is_csc) + { + vlen_new = nrows_new ; + vdim_new = ncols_new ; + } + else + { + vlen_new = ncols_new ; + vdim_new = nrows_new ; + } + + //-------------------------------------------------------------------------- + // C = reshape (T), keeping the same format (by_col) + 
//-------------------------------------------------------------------------- + + if (GB_IS_FULL (T) || GB_IS_BITMAP (T)) + { + + //---------------------------------------------------------------------- + // T and C are both full or both bitmap + //---------------------------------------------------------------------- + + if (in_place) + { + // move T into C + C = T ; + T = NULL ; + } + else + { + // copy T into C + GB_OK (GB_dup (&C, T, Context)) ; + } + // change the size of C + C->vlen = vlen_new ; + C->vdim = vdim_new ; + C->nvec = vdim_new ; + C->nvec_nonempty = (vlen_new == 0) ? 0 : vdim_new ; + + } + else + { + + //---------------------------------------------------------------------- + // sparse/hypersparse case + //---------------------------------------------------------------------- + + int64_t nvals = GB_nnz (T) ; + int64_t *Tp = T->p ; + int64_t *Th = T->h ; + int64_t *Ti = T->i ; + bool T_iso = T->iso ; + int64_t tvlen = T->vlen ; + bool T_jumbled = T->jumbled ; + + GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ; + int T_nthreads, T_ntasks ; + GB_SLICE_MATRIX (T, 1, chunk) ; + + //---------------------------------------------------------------------- + // allocate output and workspace + //---------------------------------------------------------------------- + + if (in_place) + { + + //------------------------------------------------------------------ + // Remove T->i and T->x from T; these become I_work and S_work + //------------------------------------------------------------------ + + // remove T->i from T; it becomes I_work + I_work = T->i ; I_work_size = T->i_size ; + T->i = NULL ; T->i_size = 0 ; + // remove T->x from T; it becomes S_work + S_work = T->x ; S_work_size = T->x_size ; + T->x = NULL ; T->x_size = 0 ; + S_input = NULL ; + + // move T into C + C = T ; + T = NULL ; + + } + else + { + + //------------------------------------------------------------------ + // create a new matrix C for GB_builder and allocate I_work + //------------------------------------------------------------------ + + // create the output matrix (just the header; no content) + GB_OK (GB_new (&C, // new header + type, vlen_new, vdim_new, GB_Ap_null, T_is_csc, + GxB_AUTO_SPARSITY, GB_Global_hyper_switch_get ( ), 0, + Context)) ; + // allocate new space for the future C->i + I_work = GB_MALLOC (nvals, int64_t, &I_work_size) ; + if (I_work == NULL) + { + // out of memory + GB_FREE_ALL ; + return (GrB_OUT_OF_MEMORY) ; + } + // use T->x as S_input to GB_builder, which is not modified + S_input = T->x ; + } + + if (vdim_new > 1) + { + // J_work is not needed if vdim_new == 1 + J_work = GB_MALLOC (nvals, int64_t, &J_work_size) ; + if (J_work == NULL) + { + // out of memory + GB_FREE_ALL ; + return (GrB_OUT_OF_MEMORY) ; + } + } + + //---------------------------------------------------------------------- + // construct the new indices + //---------------------------------------------------------------------- + + int tid ; + + if (vdim_new == 1) + { + + //------------------------------------------------------------------ + // C is a single vector: no J_work is needed, and new index is 1D + //------------------------------------------------------------------ + + #pragma omp parallel for num_threads(T_nthreads) schedule(static) + for (tid = 0 ; tid < T_ntasks ; tid++) + { + int64_t kfirst = kfirst_Tslice [tid] ; + int64_t klast = klast_Tslice [tid] ; + for (int64_t k = kfirst ; k <= klast ; k++) + { + int64_t jold = GBH (Th, k) ; + int64_t pT_start, pT_end ; + GB_get_pA (&pT_start, &pT_end, tid, k, 
+ kfirst, klast, pstart_Tslice, Tp, tvlen) ; + for (int64_t p = pT_start ; p < pT_end ; p++) + { + int64_t iold = Ti [p] ; + // convert (iold,jold) to a 1D index + int64_t index_1d = iold + jold * tvlen ; + // save the new 1D index + I_work [p] = index_1d ; + } + } + } + + } + else + { + + //------------------------------------------------------------------ + // C is a matrix + //------------------------------------------------------------------ + + #pragma omp parallel for num_threads(T_nthreads) schedule(static) + for (tid = 0 ; tid < T_ntasks ; tid++) + { + int64_t kfirst = kfirst_Tslice [tid] ; + int64_t klast = klast_Tslice [tid] ; + for (int64_t k = kfirst ; k <= klast ; k++) + { + int64_t jold = GBH (Th, k) ; + int64_t pT_start, pT_end ; + GB_get_pA (&pT_start, &pT_end, tid, k, + kfirst, klast, pstart_Tslice, Tp, tvlen) ; + for (int64_t p = pT_start ; p < pT_end ; p++) + { + int64_t iold = Ti [p] ; + // convert (iold,jold) to a 1D index + int64_t index_1d = iold + jold * tvlen ; + // convert the 1D index to the 2d index: (inew,jnew) + int64_t inew = index_1d % vlen_new ; + int64_t jnew = (index_1d - inew) / vlen_new ; + // save the new indices + I_work [p] = inew ; + J_work [p] = jnew ; + } + } + } + } + + //---------------------------------------------------------------------- + // free the old C->p and C->h, if constructing C in place + //---------------------------------------------------------------------- + + if (in_place) + { + GB_phbix_free (C) ; + } + + //---------------------------------------------------------------------- + // build the output matrix C + //---------------------------------------------------------------------- + + GB_OK (GB_builder ( + C, // output matrix + type, // same type as T + vlen_new, // new vlen + vdim_new, // new vdim + T_is_csc, // same format as T + &I_work, // transplanted into C->i + &I_work_size, + &J_work, // freed when done + &J_work_size, + &S_work, // array of values; transplanted into C->x in-place + &S_work_size, + !T_jumbled, // indices may be jumbled on input + true, // no duplicates exist + nvals, // number of entries in T and C + true, // C is a matrix + NULL, // I_input is not used + NULL, // J_input is not used + S_input, // S_input is used if not in-place; NULL if in-place + T_iso, // true if T and C are iso-valued + nvals, // number of entries in T and C + NULL, // no dup operator + type, // type of S_work and S_input + Context + )) ; + + ASSERT (I_work == NULL) ; // transplanted into C->i + ASSERT (J_work == NULL) ; // freed by GB_builder + ASSERT (S_work == NULL) ; // freed by GB_builder + } + + //-------------------------------------------------------------------------- + // transpose C if needed, to change its format to match the format of A + //-------------------------------------------------------------------------- + + ASSERT_MATRIX_OK (C, "C for reshape before transpose", GB0) ; + ASSERT (C->is_csc == T_is_csc) ; + if (A_is_csc != T_is_csc) + { + GB_OK (GB_transpose_in_place (C, A_is_csc, Context)) ; + } + + //-------------------------------------------------------------------------- + // free workspace, conform C, and return results + //-------------------------------------------------------------------------- + + GB_FREE_WORKSPACE ; + GB_OK (GB_conform (C, Context)) ; + ASSERT_MATRIX_OK (C, "C result for reshape", GB0) ; + if (Chandle != NULL) + { + (*Chandle) = C ; + } + return (GrB_SUCCESS) ; +} + diff --git a/GraphBLAS/Source/GB_reshape.h b/GraphBLAS/Source/GB_reshape.h new file mode 100644 index 
000000000..e6f372b50 --- /dev/null +++ b/GraphBLAS/Source/GB_reshape.h @@ -0,0 +1,27 @@ +//------------------------------------------------------------------------------ +// GB_reshape: reshape a matrix into another matrix, or reshape it in-place +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_RESHAPE_H +#define GB_RESHAPE_H + +GrB_Info GB_reshape // reshape a GrB_Matrix into another GrB_Matrix +( + // output, if not in-place: + GrB_Matrix *Chandle, // output matrix, in place if Chandle == NULL + // input, or input/output: + GrB_Matrix A, // input matrix, or input/output if in-place + // input: + bool by_col, // true if reshape by column, false if by row + int64_t nrows_new, // number of rows of C + int64_t ncols_new, // number of columns of C + GB_Context Context +) ; + +#endif + diff --git a/GraphBLAS/Source/GB_serialize.c b/GraphBLAS/Source/GB_serialize.c index a9d269c67..24be802d4 100644 --- a/GraphBLAS/Source/GB_serialize.c +++ b/GraphBLAS/Source/GB_serialize.c @@ -129,10 +129,12 @@ GrB_Info GB_serialize // serialize a matrix into a blob int32_t algo, level ; GB_serialize_method (&algo, &level, method) ; method = algo + level ; - GBURBLE ("(compression: %s%s%s:%d) ", + GBURBLE ("(compression: %s%s%s%s:%d) ", (algo == GxB_COMPRESSION_NONE ) ? "none" : "", (algo == GxB_COMPRESSION_LZ4 ) ? "LZ4" : "", - (algo == GxB_COMPRESSION_LZ4HC) ? "LZ4HC" : "", level) ; + (algo == GxB_COMPRESSION_LZ4HC) ? "LZ4HC" : "", + (algo == GxB_COMPRESSION_ZSTD ) ? "ZSTD" : "", + level) ; //-------------------------------------------------------------------------- // get the content of the matrix @@ -169,6 +171,7 @@ GrB_Info GB_serialize // serialize a matrix into a blob { case GxB_HYPERSPARSE : Ah_len = sizeof (GrB_Index) * nvec ; + // fall through to the sparse case case GxB_SPARSE : Ap_len = sizeof (GrB_Index) * (nvec+1) ; Ai_len = sizeof (GrB_Index) * anz ; @@ -176,6 +179,7 @@ GrB_Info GB_serialize // serialize a matrix into a blob break ; case GxB_BITMAP : Ab_len = sizeof (int8_t) * anz_held ; + // fall through to the full case case GxB_FULL : Ax_len = typesize * (iso ? 1 : anz_held) ; break ; @@ -323,6 +327,11 @@ GrB_Info GB_serialize // serialize a matrix into a blob { // only copy the type_name for user-defined types memset (blob + s, 0, GxB_MAX_NAME_LEN) ; + #if GB_COMPILER_GCC + #if (__GNUC__ > 5) + #pragma GCC diagnostic ignored "-Wstringop-truncation" + #endif + #endif strncpy ((char *) (blob + s), atype->name, GxB_MAX_NAME_LEN-1) ; s += GxB_MAX_NAME_LEN ; } diff --git a/GraphBLAS/Source/GB_serialize_array.c b/GraphBLAS/Source/GB_serialize_array.c index 97efed582..0e46be8ff 100644 --- a/GraphBLAS/Source/GB_serialize_array.c +++ b/GraphBLAS/Source/GB_serialize_array.c @@ -9,11 +9,12 @@ // Parallel compression method for an array. The array is compressed into // a sequence of independently allocated blocks, or returned as-is if not -// compressed. Currently, only LZ4 is supported. +// compressed. Currently, only LZ4, LZ4HC, and ZSTD are supported. 
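With ZSTD wired into GB_serialize_array above and made the default in GB_serialize_method below, a caller can select the method and level through the descriptor, using the algo + level encoding that GB_serialize decodes. A hedged usage sketch with the GxB serialize API (API names from the released library; passing a NULL type to deserialize is assumed valid for built-in types):

    #include <stdlib.h>
    #include "GraphBLAS.h"

    // serialize A into a malloc'd blob with ZSTD at level 5, then rebuild it
    GrB_Info roundtrip (GrB_Matrix *C, GrB_Matrix A)
    {
        GrB_Descriptor desc ;
        GrB_Descriptor_new (&desc) ;
        // method = algo + level, per GB_serialize_method (ZSTD: 1 to 19)
        GxB_Desc_set (desc, GxB_COMPRESSION, GxB_COMPRESSION_ZSTD + 5) ;

        void *blob = NULL ;
        GrB_Index blob_size = 0 ;
        GrB_Info info = GxB_Matrix_serialize (&blob, &blob_size, A, desc) ;
        if (info == GrB_SUCCESS)
        {
            info = GxB_Matrix_deserialize (C, NULL, blob, blob_size, desc) ;
        }
        free (blob) ;                   // the blob is malloc'd by serialize
        GrB_Descriptor_free (&desc) ;
        return (info) ;
    }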
#include "GB.h" #include "GB_serialize.h" #include "GB_lz4.h" +#include "GB_zstd.h" #define GB_FREE_ALL \ { \ @@ -47,7 +48,6 @@ GrB_Info GB_serialize_array // check inputs //-------------------------------------------------------------------------- - GrB_Info info ; ASSERT (Blocks_handle != NULL) ; ASSERT (Blocks_size_handle != NULL) ; ASSERT (Sblocks_handle != NULL) ; @@ -131,6 +131,7 @@ GrB_Info GB_serialize_array int64_t blocksize = (nthreads == 1) ? len : GB_ICEIL (len, 4*nthreads) ; // ensure the blocksize does not exceed the LZ4 maximum + // ... this is also fine for ZSTD ASSERT (LZ4_MAX_INPUT_SIZE < INT32_MAX) ; blocksize = GB_IMIN (blocksize, LZ4_MAX_INPUT_SIZE/2) ; @@ -166,7 +167,20 @@ GrB_Info GB_serialize_array size_t uncompressed = kend - kstart ; ASSERT (uncompressed < INT32_MAX) ; ASSERT (uncompressed > 0) ; - size_t s = (size_t) LZ4_compressBound ((int) uncompressed) ; + + size_t s ; + switch (algo) + { + case GxB_COMPRESSION_LZ4 : + case GxB_COMPRESSION_LZ4HC : + s = (size_t) LZ4_compressBound ((int) uncompressed) ; + break ; + default : + case GxB_COMPRESSION_ZSTD : + s = ZSTD_compressBound (uncompressed) ; + break ; + } + ASSERT (s < INT32_MAX) ; if (dryrun) { @@ -217,19 +231,32 @@ GrB_Info GB_serialize_array size_t dsize = Blocks [blockid].p_size_allocated ; // size of dest int dstCapacity = GB_IMIN (dsize, INT32_MAX) ; int s ; + size_t s64 ; switch (algo) { - default : + case GxB_COMPRESSION_LZ4 : s = LZ4_compress_default (src, dst, srcSize, dstCapacity) ; + ok = ok && (s > 0) ; + // compressed block is now in dst [0:s-1], of size s + Sblocks [blockid] = (int64_t) s ; break ; + case GxB_COMPRESSION_LZ4HC : s = LZ4_compress_HC (src, dst, srcSize, dstCapacity, level) ; + ok = ok && (s > 0) ; + // compressed block is now in dst [0:s-1], of size s + Sblocks [blockid] = (int64_t) s ; + break ; + + default : + case GxB_COMPRESSION_ZSTD : + s64 = ZSTD_compress (dst, dstCapacity, src, srcSize, level) ; + ok = ok && (s64 <= dstCapacity) ; + // compressed block is now in dst [0:s64-1], of size s64 + Sblocks [blockid] = (int64_t) s64 ; break ; } - ok = ok && (s > 0) ; - // compressed block is now in dst [0:s-1], of size s - Sblocks [blockid] = (int64_t) s ; } if (!ok) diff --git a/GraphBLAS/Source/GB_serialize_method.c b/GraphBLAS/Source/GB_serialize_method.c index 6ad1fcef9..e3fdbca44 100644 --- a/GraphBLAS/Source/GB_serialize_method.c +++ b/GraphBLAS/Source/GB_serialize_method.c @@ -40,24 +40,29 @@ void GB_serialize_method { default : - (*algo) = GxB_COMPRESSION_LZ4 ; - (*level) = 0 ; // level is ignored + // The default method has changed to ZSTD, level 1, as of + // SuiteSparse:GraphBLAS v7.2.0. + (*algo) = GxB_COMPRESSION_ZSTD ; + (*level) = 1 ; // fast with good compression break ; case GxB_COMPRESSION_LZ4 : (*level) = 0 ; // level is ignored break ; - case GxB_COMPRESSION_LZ4HC : - // level 1 to 9, with a default of 9. Note that LZ4HC supports - // levels 10, 11, and 12, but these are very slow and do not - // provide much benefit over level 9. Level 10 often results in - // a larger blob than level 9. Level 12 is typically just a tiny - // bit more compact than level 9, but can be 10x slower, or worse, - // as compared to level 9. + case GxB_COMPRESSION_LZ4HC : // LZ4HC: level 1 to 9; default 9. + // Note that LZ4HC supports levels 10, 11, and 12, but these are + // very slow and do not provide much benefit over level 9. Level + // 10 often results in a larger blob than level 9. 
Level 12 is + // typically just a tiny bit more compact than level 9, but can be + // 10x slower, or worse, as compared to level 9. if ((*level) <= 0 || (*level) > 9) (*level) = 9 ; break ; + case GxB_COMPRESSION_ZSTD : // ZSTD: level 1 to 19; default 1. + if ((*level) <= 0 || (*level) > 19) (*level) = 1 ; + break ; + // These cases will be uncommented when the methods are implemented: // case GxB_COMPRESSION_ZLIB: diff --git a/GraphBLAS/Source/GB_serialize_to_blob.c b/GraphBLAS/Source/GB_serialize_to_blob.c index eb47220fa..46289706b 100644 --- a/GraphBLAS/Source/GB_serialize_to_blob.c +++ b/GraphBLAS/Source/GB_serialize_to_blob.c @@ -27,7 +27,6 @@ void GB_serialize_to_blob // check inputs //-------------------------------------------------------------------------- - GrB_Info info ; ASSERT (blob != NULL) ; ASSERT (s_handle != NULL) ; ASSERT (nblocks >= 0) ; diff --git a/GraphBLAS/Source/GB_setElement.c b/GraphBLAS/Source/GB_setElement.c index 031f112ce..6291fe23c 100644 --- a/GraphBLAS/Source/GB_setElement.c +++ b/GraphBLAS/Source/GB_setElement.c @@ -1,5 +1,5 @@ //------------------------------------------------------------------------------ -// GB_setElement: C(row,col) = scalar +// GB_setElement: C(row,col) = scalar or += scalar //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -17,14 +17,15 @@ // non-blocking, the tuple (i,j,scalar) is appended to a list of pending tuples // to C. GB_wait assembles these pending tuples. -// GrB_setElement when accum is NULL is the same as GrB_*assign with an implied -// SECOND accum operator whose ztype, xtype, and ytype are the same as C, with -// I=i, J=j, a 1-by-1 dense matrix A (where nnz (A) == 1), no mask, mask not -// complemented, C_replace effectively false (its value is ignored), and A -// transpose effectively false (since transposing a scalar has no effect). +// GB_setElement when accum is NULL is used by GrB_*_setElement. It is the +// same as GrB_*assign with an implied SECOND accum operator whose ztype, +// xtype, and ytype are the same as C, with I=i, J=j, a 1-by-1 dense matrix A +// (where nnz (A) == 1), no mask, mask not complemented, C_replace effectively +// false (its value is ignored), and A transpose effectively false (since +// transposing a scalar has no effect). -// GrB_setElement when accum is not NULL uses the accum operator instead of -// the implied SECOND operator. +// GB_setElement when accum is not NULL uses the accum operator instead of the +// implied SECOND operator. It is used by GrB_*_assign, as a special case. // Compare this function with GrB_*_extractElement_* diff --git a/GraphBLAS/Source/GB_shallow_copy.c b/GraphBLAS/Source/GB_shallow_copy.c index ad6461f6b..53980ae7f 100644 --- a/GraphBLAS/Source/GB_shallow_copy.c +++ b/GraphBLAS/Source/GB_shallow_copy.c @@ -24,6 +24,7 @@ // Compare this function with GB_shallow_op.c. 
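The GB_setElement change above (an optional accum operator) lets a single-entry assign take the setElement fast path. A user-level sketch of the two cases, assuming a GrB_Matrix C of type FP64 and valid indices i and j:

    // C(i,j) = 3.14 : GB_setElement with accum == NULL
    GrB_Matrix_setElement_FP64 (C, 3.14, i, j) ;
    // C(i,j) += 3.14 : a 1-by-1 GrB_assign with accum, the special case above
    GrB_Matrix_assign_FP64 (C, NULL, GrB_PLUS_FP64, 3.14, &i, 1, &j, 1, NULL) ;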
#include "GB_transpose.h" +#include "GB_unused.h" #define GB_FREE_ALL ; diff --git a/GraphBLAS/Source/GB_split_bitmap.c b/GraphBLAS/Source/GB_split_bitmap.c index 9ed1485b1..9d1e0f69f 100644 --- a/GraphBLAS/Source/GB_split_bitmap.c +++ b/GraphBLAS/Source/GB_split_bitmap.c @@ -37,11 +37,11 @@ GrB_Info GB_split_bitmap // split a bitmap matrix bool csc = A->is_csc ; GrB_Type atype = A->type ; int64_t avlen = A->vlen ; - int64_t avdim = A->vdim ; +// int64_t avdim = A->vdim ; size_t asize = atype->size ; const int8_t *restrict Ab = A->b ; const bool A_iso = A->iso ; - int64_t anz = GB_nnz (A) ; +// int64_t anz = GB_nnz (A) ; GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ; @@ -145,11 +145,13 @@ GrB_Info GB_split_bitmap // split a bitmap matrix case GB_16BYTE : // double complex or 16-byte user-defined #define GB_CTYPE GB_blob16 -// #define GB_CTYPE uint64_t -// #undef GB_COPY -// #define GB_COPY(pC,pA) \ -// Cx [2*pC ] = Ax [2*pA ] ; \ -// Cx [2*pC+1] = Ax [2*pA+1] ; + /* + #define GB_CTYPE uint64_t + #undef GB_COPY + #define GB_COPY(pC,pA) \ + Cx [2*pC ] = Ax [2*pA ] ; \ + Cx [2*pC+1] = Ax [2*pA+1] ; + */ #include "GB_split_bitmap_template.c" break ; diff --git a/GraphBLAS/Source/GB_split_full.c b/GraphBLAS/Source/GB_split_full.c index 23d58c6ea..df65dd235 100644 --- a/GraphBLAS/Source/GB_split_full.c +++ b/GraphBLAS/Source/GB_split_full.c @@ -37,7 +37,7 @@ GrB_Info GB_split_full // split a full matrix bool csc = A->is_csc ; GrB_Type atype = A->type ; int64_t avlen = A->vlen ; - int64_t avdim = A->vdim ; +// int64_t avdim = A->vdim ; size_t asize = atype->size ; const bool A_iso = A->iso ; @@ -137,11 +137,13 @@ GrB_Info GB_split_full // split a full matrix case GB_16BYTE : // double complex or 16-byte user #define GB_CTYPE GB_blob16 -// #define GB_CTYPE uint64_t -// #undef GB_COPY -// #define GB_COPY(pC,pA) \ -// Cx [2*pC ] = Ax [2*pA ] ; \ -// Cx [2*pC+1] = Ax [2*pA+1] ; + /* + #define GB_CTYPE uint64_t + #undef GB_COPY + #define GB_COPY(pC,pA) \ + Cx [2*pC ] = Ax [2*pA ] ; \ + Cx [2*pC+1] = Ax [2*pA+1] ; + */ #include "GB_split_full_template.c" break ; diff --git a/GraphBLAS/Source/GB_split_sparse.c b/GraphBLAS/Source/GB_split_sparse.c index 9d35db3fc..fa730feb5 100644 --- a/GraphBLAS/Source/GB_split_sparse.c +++ b/GraphBLAS/Source/GB_split_sparse.c @@ -45,8 +45,8 @@ GrB_Info GB_split_sparse // split a sparse matrix float hyper_switch = A->hyper_switch ; bool csc = A->is_csc ; GrB_Type atype = A->type ; - int64_t avlen = A->vlen ; - int64_t avdim = A->vdim ; +// int64_t avlen = A->vlen ; +// int64_t avdim = A->vdim ; size_t asize = atype->size ; GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ; @@ -273,11 +273,13 @@ GrB_Info GB_split_sparse // split a sparse matrix case GB_16BYTE : // double complex or 16-byte user-defined #define GB_CTYPE GB_blob16 -// #define GB_CTYPE uint64_t -// #undef GB_COPY -// #define GB_COPY(pC,pA) \ -// Cx [2*pC ] = Ax [2*pA ] ; \ -// Cx [2*pC+1] = Ax [2*pA+1] ; + /* + #define GB_CTYPE uint64_t + #undef GB_COPY + #define GB_COPY(pC,pA) \ + Cx [2*pC ] = Ax [2*pA ] ; \ + Cx [2*pC+1] = Ax [2*pA+1] ; + */ #include "GB_split_sparse_template.c" break ; diff --git a/GraphBLAS/Source/GB_subassign_09.c b/GraphBLAS/Source/GB_subassign_09.c index 1f02ffef8..8b1e53081 100644 --- a/GraphBLAS/Source/GB_subassign_09.c +++ b/GraphBLAS/Source/GB_subassign_09.c @@ -18,8 +18,8 @@ // C: not bitmap or full -#include "GB_unused.h" #include "GB_subassign_methods.h" +#include "GB_unused.h" GrB_Info GB_subassign_09 ( diff --git a/GraphBLAS/Source/GB_subassign_11.c 
b/GraphBLAS/Source/GB_subassign_11.c index 3f22e08e6..e6abe3f92 100644 --- a/GraphBLAS/Source/GB_subassign_11.c +++ b/GraphBLAS/Source/GB_subassign_11.c @@ -18,8 +18,8 @@ // C, M: not bitmap -#include "GB_unused.h" #include "GB_subassign_methods.h" +#include "GB_unused.h" GrB_Info GB_subassign_11 ( diff --git a/GraphBLAS/Source/GB_subassign_methods.h b/GraphBLAS/Source/GB_subassign_methods.h index d7e6b48d7..c8c7381fd 100644 --- a/GraphBLAS/Source/GB_subassign_methods.h +++ b/GraphBLAS/Source/GB_subassign_methods.h @@ -14,8 +14,8 @@ #include "GB_add.h" #include "GB_ij.h" #include "GB_Pending.h" -#include "GB_unused.h" #include "GB_subassign_IxJ_slice.h" +#include "GB_unused.h" //------------------------------------------------------------------------------ // free workspace diff --git a/GraphBLAS/Source/GB_subref_phase3.c b/GraphBLAS/Source/GB_subref_phase3.c index 525c123f9..6359945aa 100644 --- a/GraphBLAS/Source/GB_subref_phase3.c +++ b/GraphBLAS/Source/GB_subref_phase3.c @@ -12,6 +12,7 @@ #include "GB_subref.h" #include "GB_sort.h" +#include "GB_unused.h" GrB_Info GB_subref_phase3 // C=A(I,J) ( diff --git a/GraphBLAS/Source/GB_transpose.c b/GraphBLAS/Source/GB_transpose.c index 8c2f78baa..45eead3e4 100644 --- a/GraphBLAS/Source/GB_transpose.c +++ b/GraphBLAS/Source/GB_transpose.c @@ -122,8 +122,8 @@ GrB_Info GB_transpose // C=A', C=(ctype)A' or C=op(A') //-------------------------------------------------------------------------- GrB_Type atype = A->type ; - size_t asize = atype->size ; - GB_Type_code acode = atype->code ; +// size_t asize = atype->size ; +// GB_Type_code acode = atype->code ; bool A_is_bitmap = GB_IS_BITMAP (A) ; bool A_is_hyper = GB_IS_HYPERSPARSE (A) ; @@ -729,8 +729,8 @@ GrB_Info GB_transpose // C=A', C=(ctype)A' or C=op(A') // allocate the output matrix and additional space (jwork and Swork) //------------------------------------------------------------------ - // initialize the header of T, with no content - // content, and initialize the type and dimension of T. + // initialize the header of T, with no content, + // and initialize the type and dimension of T. info = GB_new (&T, // hyper, existing header ctype, avdim, avlen, GB_Ap_null, C_is_csc, diff --git a/GraphBLAS/Source/GB_transpose_bucket.c b/GraphBLAS/Source/GB_transpose_bucket.c index 761d74a4d..9727c2d19 100644 --- a/GraphBLAS/Source/GB_transpose_bucket.c +++ b/GraphBLAS/Source/GB_transpose_bucket.c @@ -251,7 +251,7 @@ GrB_Info GB_transpose_bucket // bucket transpose; typecast and apply op ASSERT (nworkspaces == nthreads) ; const int64_t *restrict Ap = A->p ; - const int64_t *restrict Ah = A->h ; +// const int64_t *restrict Ah = A->h ; const int64_t *restrict Ai = A->i ; int tid ; @@ -264,7 +264,7 @@ GrB_Info GB_transpose_bucket // bucket transpose; typecast and apply op for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++) { // iterate over the entries in A(:,j) - int64_t j = GBH (Ah, k) ; + // int64_t j = GBH (Ah, k) ; int64_t pA_start = Ap [k] ; int64_t pA_end = Ap [k+1] ; for (int64_t pA = pA_start ; pA < pA_end ; pA++) diff --git a/GraphBLAS/Source/GB_transpose_method.c b/GraphBLAS/Source/GB_transpose_method.c index 87de866fd..4397896a3 100644 --- a/GraphBLAS/Source/GB_transpose_method.c +++ b/GraphBLAS/Source/GB_transpose_method.c @@ -30,7 +30,7 @@ bool GB_transpose_method // if true: use GB_builder, false: use bucket int64_t anvec = (A->nvec_nonempty < 0) ? 
A->nvec : A->nvec_nonempty ; int64_t anz = GB_nnz (A) ; int64_t avlen = A->vlen ; - int64_t avdim = A->vdim ; +// int64_t avdim = A->vdim ; int anzlog = (anz == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ; int mlog = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ; double bucket_factor ; diff --git a/GraphBLAS/Source/GB_unused.h b/GraphBLAS/Source/GB_unused.h index 27abbc14e..3043f60f9 100644 --- a/GraphBLAS/Source/GB_unused.h +++ b/GraphBLAS/Source/GB_unused.h @@ -30,6 +30,7 @@ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" #endif #pragma GCC diagnostic ignored "-Wunused-variable" + #pragma GCC diagnostic ignored "-Wuninitialized" #endif diff --git a/GraphBLAS/Source/GB_warnings.h b/GraphBLAS/Source/GB_warnings.h index f90877b58..04529b296 100644 --- a/GraphBLAS/Source/GB_warnings.h +++ b/GraphBLAS/Source/GB_warnings.h @@ -40,6 +40,8 @@ #pragma GCC diagnostic ignored "-Wint-in-bool-context" #pragma GCC diagnostic ignored "-Wformat-truncation=" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" + #pragma GCC diagnostic ignored "-Wtype-limits" // enable these warnings as errors #pragma GCC diagnostic error "-Wmisleading-indentation" #endif diff --git a/GraphBLAS/Source/GB_zstd.c b/GraphBLAS/Source/GB_zstd.c new file mode 100644 index 000000000..f03e35470 --- /dev/null +++ b/GraphBLAS/Source/GB_zstd.c @@ -0,0 +1,77 @@ +//------------------------------------------------------------------------------ +// GB_zstd: wrapper for the ZSTD compression library +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// GB_zstd is a wrapper for the ZSTD compression library. The +// ZSTD library is compiled with ZSTD_DEPS_MALLOC enabled (which is not +// the default), and configured to use the SuiteSparse:GraphBLAS functions +// in place of malloc/calloc/free. + +#include "GB.h" +#include "GB_serialize.h" +#include "GB_zstd.h" + +void *ZSTD_malloc (size_t s) +{ + return (GB_Global_malloc_function (s)) ; +} + +void *ZSTD_calloc (size_t n, size_t s) +{ + // ns = n*s, the size of the space to allocate + size_t ns = 0 ; + bool ok = GB_size_t_multiply (&ns, n, s) ; + if (!ok) return (NULL) ; + // malloc the space and then use memset to clear it + void *p = GB_Global_malloc_function (ns) ; + if (p != NULL) memset (p, 0, ns) ; + return (p) ; +} + +void ZSTD_free (void *p) +{ + GB_Global_free_function (p) ; +} + +// ZSTD uses switch statements with no default case. +#pragma GCC diagnostic ignored "-Wswitch-default" + +// Include the unmodified zstd, version 1.5.3. This +// allows the ZSTD_* functions to be renamed via GB_zstd.h, and avoids any +// conflict with the original -lzstd, which might be linked in by the user +// application.
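Outside of GraphBLAS the zstd symbols keep their usual names; the three entry points this wrapper exposes follow the standard bound/compress/decompress pattern. A standalone sketch against the stock zstd API, assuming buffers src and back of srcSize bytes each:

    #include <stdlib.h>
    #include <zstd.h>
    // compress src into dst, then decompress into back and check the size
    size_t bound = ZSTD_compressBound (srcSize) ;  // worst-case compressed size
    void *dst = malloc (bound) ;
    size_t csize = ZSTD_compress (dst, bound, src, srcSize, 1) ;   // level 1
    if (ZSTD_isError (csize)) { /* handle the error */ }
    size_t dsize = ZSTD_decompress (back, srcSize, dst, csize) ;
    // on success, dsize == srcSize and back matches src byte for byte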
+ +#include "zstd_subset/common/debug.c" +#include "zstd_subset/common/entropy_common.c" +#include "zstd_subset/common/error_private.c" +#include "zstd_subset/common/fse_decompress.c" +#include "zstd_subset/common/pool.c" +#include "zstd_subset/common/threading.c" +#include "zstd_subset/common/xxhash.c" +#include "zstd_subset/common/zstd_common.c" + +#include "zstd_subset/compress/fse_compress.c" +#include "zstd_subset/compress/hist.c" +#include "zstd_subset/compress/huf_compress.c" +#include "zstd_subset/compress/zstd_compress.c" +#include "zstd_subset/compress/zstd_compress_literals.c" +#include "zstd_subset/compress/zstd_compress_sequences.c" +#include "zstd_subset/compress/zstd_compress_superblock.c" +#include "zstd_subset/compress/zstd_double_fast.c" +#include "zstd_subset/compress/zstd_fast.c" +#include "zstd_subset/compress/zstd_lazy.c" +#include "zstd_subset/compress/zstd_ldm.c" +#include "zstd_subset/compress/zstdmt_compress.c" +#include "zstd_subset/compress/zstd_opt.c" + +#include "zstd_subset/decompress/huf_decompress.c" +#include "zstd_subset/decompress/zstd_ddict.c" +#include "zstd_subset/decompress/zstd_decompress_block.c" +#include "zstd_subset/decompress/zstd_decompress.c" + + diff --git a/GraphBLAS/Source/GB_zstd.h b/GraphBLAS/Source/GB_zstd.h new file mode 100644 index 000000000..5de014ddd --- /dev/null +++ b/GraphBLAS/Source/GB_zstd.h @@ -0,0 +1,871 @@ +//------------------------------------------------------------------------------ +// GB_zstd.h: definitions for a wrapper for the ZSTD compression library +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// It's possible that the user application has its own copy of the ZSTD library, +// which wouldn't be using the SuiteSparse:GraphBLAS memory allocator. To +// avoid any conflict between multiple copies of the ZSTD library, all global +// symbols ZSTD_* are renamed to GBZSTD (ZSTD_*), via #defines below. + +#ifndef GB_ZSTD_H +#define GB_ZSTD_H + +// ZSTD has its own GB macro, so #undefine the GraphBLAS one, and use GBZSTD +// to rename the ZSTD functions. +#undef GB + +#ifdef GBRENAME + #define GBZSTD(x) GB_EVAL2 (GM_, x) +#else + #define GBZSTD(x) GB_EVAL2 (GB_, x) +#endif + +//------------------------------------------------------------------------------ +// methods called directly by GraphBLAS +//------------------------------------------------------------------------------ + +// size_t ZSTD_compressBound (size_t s) : returns the maximum size for the +// compression of a block of s bytes. +#define ZSTD_compressBound GBZSTD (ZSTD_compressBound) + +// size_t ZSTD_compress (void *dst, size_t dstCap, const void *src, size_t +// srcSize, int level) : compresses the uncompressed src block of size srcSize +// into the output buffer dst of size dstCap. Returns the compressed size +// written into dst (<= dstCap), or an error code if it fails. +#define ZSTD_compress GBZSTD (ZSTD_compress) + +// size_t ZSTD_decompress (void *dst, size_t dstCap, const void *src, size_t +// compressedSize) : decompresses the compressed src block of size +// compressedSize into the dst block of size dstCap. Returns the # of bytes +// written to dst (<= dstCap), or an error code if it fails. 
+#define ZSTD_decompress GBZSTD (ZSTD_decompress) + +//------------------------------------------------------------------------------ +// ensure that ZSTD_malloc, ZSTD_calloc, and ZSTD_free are used. +//------------------------------------------------------------------------------ + +// ZSTD will use these 3 functions in place of malloc, calloc, and free. They +// are defined in GB_zstd.c, and rely on the malloc and free methods provided by +// the user application to GraphBLAS by GrB_init or GxB_init. + +#define ZSTD_DEPS_MALLOC +#define ZSTD_malloc GBZSTD (ZSTD_malloc) +#define ZSTD_calloc GBZSTD (ZSTD_calloc) +#define ZSTD_free GBZSTD (ZSTD_free) +void *ZSTD_malloc (size_t s) ; +void *ZSTD_calloc (size_t n, size_t s) ; +void ZSTD_free (void *p) ; + +//------------------------------------------------------------------------------ +// methods not directly used, or not used at all by GraphBLAS +//------------------------------------------------------------------------------ + +#define ZSTD_adjustCParams \ +GBZSTD (ZSTD_adjustCParams) +#define ZSTD_buildBlockEntropyStats \ +GBZSTD (ZSTD_buildBlockEntropyStats) +#define ZSTD_buildCTable \ +GBZSTD (ZSTD_buildCTable) +#define ZSTD_buildFSETable \ +GBZSTD (ZSTD_buildFSETable) +#define ZSTD_CCtx_getParameter \ +GBZSTD (ZSTD_CCtx_getParameter) +#define ZSTD_CCtx_loadDictionary \ +GBZSTD (ZSTD_CCtx_loadDictionary) +#define ZSTD_CCtx_loadDictionary_advanced \ +GBZSTD (ZSTD_CCtx_loadDictionary_advanced) +#define ZSTD_CCtx_loadDictionary_byReference \ +GBZSTD (ZSTD_CCtx_loadDictionary_byReference) +#define ZSTD_CCtxParams_getParameter \ +GBZSTD (ZSTD_CCtxParams_getParameter) +#define ZSTD_CCtxParams_init \ +GBZSTD (ZSTD_CCtxParams_init) +#define ZSTD_CCtxParams_init_advanced \ +GBZSTD (ZSTD_CCtxParams_init_advanced) +#define ZSTD_CCtxParams_reset \ +GBZSTD (ZSTD_CCtxParams_reset) +#define ZSTD_CCtxParams_setParameter \ +GBZSTD (ZSTD_CCtxParams_setParameter) +#define ZSTD_CCtx_refCDict \ +GBZSTD (ZSTD_CCtx_refCDict) +#define ZSTD_CCtx_refPrefix \ +GBZSTD (ZSTD_CCtx_refPrefix) +#define ZSTD_CCtx_refPrefix_advanced \ +GBZSTD (ZSTD_CCtx_refPrefix_advanced) +#define ZSTD_CCtx_refThreadPool \ +GBZSTD (ZSTD_CCtx_refThreadPool) +#define ZSTD_CCtx_reset \ +GBZSTD (ZSTD_CCtx_reset) +#define ZSTD_CCtx_setParameter \ +GBZSTD (ZSTD_CCtx_setParameter) +#define ZSTD_CCtx_setParametersUsingCCtxParams \ +GBZSTD (ZSTD_CCtx_setParametersUsingCCtxParams) +#define ZSTD_CCtx_setPledgedSrcSize \ +GBZSTD (ZSTD_CCtx_setPledgedSrcSize) +#define ZSTD_CCtx_trace \ +GBZSTD (ZSTD_CCtx_trace) +#define ZSTD_checkContinuity \ +GBZSTD (ZSTD_checkContinuity) +#define ZSTD_checkCParams \ +GBZSTD (ZSTD_checkCParams) +#define ZSTD_compress2 \ +GBZSTD (ZSTD_compress2) +#define ZSTD_compress_advanced \ +GBZSTD (ZSTD_compress_advanced) +#define ZSTD_compress_advanced_internal \ +GBZSTD (ZSTD_compress_advanced_internal) +#define ZSTD_compressBegin \ +GBZSTD (ZSTD_compressBegin) +#define ZSTD_compressBegin_advanced \ +GBZSTD (ZSTD_compressBegin_advanced) +#define ZSTD_compressBegin_advanced_internal \ +GBZSTD (ZSTD_compressBegin_advanced_internal) +#define ZSTD_compressBegin_usingCDict \ +GBZSTD (ZSTD_compressBegin_usingCDict) +#define ZSTD_compressBegin_usingCDict_advanced \ +GBZSTD (ZSTD_compressBegin_usingCDict_advanced) +#define ZSTD_compressBegin_usingDict \ +GBZSTD (ZSTD_compressBegin_usingDict) +#define ZSTD_compressBlock \ +GBZSTD (ZSTD_compressBlock) +#define ZSTD_compressBlock_btlazy2 \ +GBZSTD (ZSTD_compressBlock_btlazy2) +#define ZSTD_compressBlock_btlazy2_dictMatchState \ +GBZSTD 
(ZSTD_compressBlock_btlazy2_dictMatchState) +#define ZSTD_compressBlock_btlazy2_extDict \ +GBZSTD (ZSTD_compressBlock_btlazy2_extDict) +#define ZSTD_compressBlock_btopt \ +GBZSTD (ZSTD_compressBlock_btopt) +#define ZSTD_compressBlock_btopt_dictMatchState \ +GBZSTD (ZSTD_compressBlock_btopt_dictMatchState) +#define ZSTD_compressBlock_btopt_extDict \ +GBZSTD (ZSTD_compressBlock_btopt_extDict) +#define ZSTD_compressBlock_btultra \ +GBZSTD (ZSTD_compressBlock_btultra) +#define ZSTD_compressBlock_btultra2 \ +GBZSTD (ZSTD_compressBlock_btultra2) +#define ZSTD_compressBlock_btultra_dictMatchState \ +GBZSTD (ZSTD_compressBlock_btultra_dictMatchState) +#define ZSTD_compressBlock_btultra_extDict \ +GBZSTD (ZSTD_compressBlock_btultra_extDict) +#define ZSTD_compressBlock_doubleFast \ +GBZSTD (ZSTD_compressBlock_doubleFast) +#define ZSTD_compressBlock_doubleFast_dictMatchState \ +GBZSTD (ZSTD_compressBlock_doubleFast_dictMatchState) +#define ZSTD_compressBlock_doubleFast_extDict \ +GBZSTD (ZSTD_compressBlock_doubleFast_extDict) +#define ZSTD_compressBlock_fast \ +GBZSTD (ZSTD_compressBlock_fast) +#define ZSTD_compressBlock_fast_dictMatchState \ +GBZSTD (ZSTD_compressBlock_fast_dictMatchState) +#define ZSTD_compressBlock_fast_extDict \ +GBZSTD (ZSTD_compressBlock_fast_extDict) +#define ZSTD_compressBlock_greedy \ +GBZSTD (ZSTD_compressBlock_greedy) +#define ZSTD_compressBlock_greedy_dedicatedDictSearch \ +GBZSTD (ZSTD_compressBlock_greedy_dedicatedDictSearch) +#define ZSTD_compressBlock_greedy_dedicatedDictSearch_row \ +GBZSTD (ZSTD_compressBlock_greedy_dedicatedDictSearch_row) +#define ZSTD_compressBlock_greedy_dictMatchState \ +GBZSTD (ZSTD_compressBlock_greedy_dictMatchState) +#define ZSTD_compressBlock_greedy_dictMatchState_row \ +GBZSTD (ZSTD_compressBlock_greedy_dictMatchState_row) +#define ZSTD_compressBlock_greedy_extDict \ +GBZSTD (ZSTD_compressBlock_greedy_extDict) +#define ZSTD_compressBlock_greedy_extDict_row \ +GBZSTD (ZSTD_compressBlock_greedy_extDict_row) +#define ZSTD_compressBlock_greedy_row \ +GBZSTD (ZSTD_compressBlock_greedy_row) +#define ZSTD_compressBlock_lazy \ +GBZSTD (ZSTD_compressBlock_lazy) +#define ZSTD_compressBlock_lazy2 \ +GBZSTD (ZSTD_compressBlock_lazy2) +#define ZSTD_compressBlock_lazy2_dedicatedDictSearch \ +GBZSTD (ZSTD_compressBlock_lazy2_dedicatedDictSearch) +#define ZSTD_compressBlock_lazy2_dedicatedDictSearch_row \ +GBZSTD (ZSTD_compressBlock_lazy2_dedicatedDictSearch_row) +#define ZSTD_compressBlock_lazy2_dictMatchState \ +GBZSTD (ZSTD_compressBlock_lazy2_dictMatchState) +#define ZSTD_compressBlock_lazy2_dictMatchState_row \ +GBZSTD (ZSTD_compressBlock_lazy2_dictMatchState_row) +#define ZSTD_compressBlock_lazy2_extDict \ +GBZSTD (ZSTD_compressBlock_lazy2_extDict) +#define ZSTD_compressBlock_lazy2_extDict_row \ +GBZSTD (ZSTD_compressBlock_lazy2_extDict_row) +#define ZSTD_compressBlock_lazy2_row \ +GBZSTD (ZSTD_compressBlock_lazy2_row) +#define ZSTD_compressBlock_lazy_dedicatedDictSearch \ +GBZSTD (ZSTD_compressBlock_lazy_dedicatedDictSearch) +#define ZSTD_compressBlock_lazy_dedicatedDictSearch_row \ +GBZSTD (ZSTD_compressBlock_lazy_dedicatedDictSearch_row) +#define ZSTD_compressBlock_lazy_dictMatchState \ +GBZSTD (ZSTD_compressBlock_lazy_dictMatchState) +#define ZSTD_compressBlock_lazy_dictMatchState_row \ +GBZSTD (ZSTD_compressBlock_lazy_dictMatchState_row) +#define ZSTD_compressBlock_lazy_extDict \ +GBZSTD (ZSTD_compressBlock_lazy_extDict) +#define ZSTD_compressBlock_lazy_extDict_row \ +GBZSTD (ZSTD_compressBlock_lazy_extDict_row) +#define 
ZSTD_compressBlock_lazy_row \ +GBZSTD (ZSTD_compressBlock_lazy_row) +#define ZSTD_compressCCtx \ +GBZSTD (ZSTD_compressCCtx) +#define ZSTD_compressContinue \ +GBZSTD (ZSTD_compressContinue) +#define ZSTD_compressEnd \ +GBZSTD (ZSTD_compressEnd) +#define ZSTD_compressLiterals \ +GBZSTD (ZSTD_compressLiterals) +#define ZSTD_compressRleLiteralsBlock \ +GBZSTD (ZSTD_compressRleLiteralsBlock) +#define ZSTD_compressSequences \ +GBZSTD (ZSTD_compressSequences) +#define ZSTD_compressStream \ +GBZSTD (ZSTD_compressStream) +#define ZSTD_compressStream2 \ +GBZSTD (ZSTD_compressStream2) +#define ZSTD_compressStream2_simpleArgs \ +GBZSTD (ZSTD_compressStream2_simpleArgs) +#define ZSTD_compressSuperBlock \ +GBZSTD (ZSTD_compressSuperBlock) +#define ZSTD_compress_usingCDict \ +GBZSTD (ZSTD_compress_usingCDict) +#define ZSTD_compress_usingCDict_advanced \ +GBZSTD (ZSTD_compress_usingCDict_advanced) +#define ZSTD_compress_usingDict \ +GBZSTD (ZSTD_compress_usingDict) +#define ZSTD_copyCCtx \ +GBZSTD (ZSTD_copyCCtx) +#define ZSTD_copyDCtx \ +GBZSTD (ZSTD_copyDCtx) +#define ZSTD_copyDDictParameters \ +GBZSTD (ZSTD_copyDDictParameters) +#define ZSTD_cParam_getBounds \ +GBZSTD (ZSTD_cParam_getBounds) +#define ZSTD_createCCtx \ +GBZSTD (ZSTD_createCCtx) +#define ZSTD_createCCtx_advanced \ +GBZSTD (ZSTD_createCCtx_advanced) +#define ZSTD_createCCtxParams \ +GBZSTD (ZSTD_createCCtxParams) +#define ZSTD_createCDict \ +GBZSTD (ZSTD_createCDict) +#define ZSTD_createCDict_advanced \ +GBZSTD (ZSTD_createCDict_advanced) +#define ZSTD_createCDict_advanced2 \ +GBZSTD (ZSTD_createCDict_advanced2) +#define ZSTD_createCDict_byReference \ +GBZSTD (ZSTD_createCDict_byReference) +#define ZSTD_createCStream \ +GBZSTD (ZSTD_createCStream) +#define ZSTD_createCStream_advanced \ +GBZSTD (ZSTD_createCStream_advanced) +#define ZSTD_createDCtx \ +GBZSTD (ZSTD_createDCtx) +#define ZSTD_createDCtx_advanced \ +GBZSTD (ZSTD_createDCtx_advanced) +#define ZSTD_createDDict \ +GBZSTD (ZSTD_createDDict) +#define ZSTD_createDDict_advanced \ +GBZSTD (ZSTD_createDDict_advanced) +#define ZSTD_createDDict_byReference \ +GBZSTD (ZSTD_createDDict_byReference) +#define ZSTD_createDStream \ +GBZSTD (ZSTD_createDStream) +#define ZSTD_createDStream_advanced \ +GBZSTD (ZSTD_createDStream_advanced) +#define ZSTD_crossEntropyCost \ +GBZSTD (ZSTD_crossEntropyCost) +#define ZSTD_CStreamInSize \ +GBZSTD (ZSTD_CStreamInSize) +#define ZSTD_CStreamOutSize \ +GBZSTD (ZSTD_CStreamOutSize) +#define ZSTD_customCalloc \ +GBZSTD (ZSTD_customCalloc) +#define ZSTD_customFree \ +GBZSTD (ZSTD_customFree) +#define ZSTD_customMalloc \ +GBZSTD (ZSTD_customMalloc) +#define ZSTD_cycleLog \ +GBZSTD (ZSTD_cycleLog) +#define ZSTD_DCtx_getParameter \ +GBZSTD (ZSTD_DCtx_getParameter) +#define ZSTD_DCtx_loadDictionary \ +GBZSTD (ZSTD_DCtx_loadDictionary) +#define ZSTD_DCtx_loadDictionary_advanced \ +GBZSTD (ZSTD_DCtx_loadDictionary_advanced) +#define ZSTD_DCtx_loadDictionary_byReference \ +GBZSTD (ZSTD_DCtx_loadDictionary_byReference) +#define ZSTD_DCtx_refDDict \ +GBZSTD (ZSTD_DCtx_refDDict) +#define ZSTD_DCtx_refPrefix \ +GBZSTD (ZSTD_DCtx_refPrefix) +#define ZSTD_DCtx_refPrefix_advanced \ +GBZSTD (ZSTD_DCtx_refPrefix_advanced) +#define ZSTD_DCtx_reset \ +GBZSTD (ZSTD_DCtx_reset) +#define ZSTD_DCtx_setFormat \ +GBZSTD (ZSTD_DCtx_setFormat) +#define ZSTD_DCtx_setMaxWindowSize \ +GBZSTD (ZSTD_DCtx_setMaxWindowSize) +#define ZSTD_DCtx_setParameter \ +GBZSTD (ZSTD_DCtx_setParameter) +#define ZSTD_DDict_dictContent \ +GBZSTD (ZSTD_DDict_dictContent) +#define ZSTD_DDict_dictSize \ 
+GBZSTD (ZSTD_DDict_dictSize) +#define ZSTD_decodeLiteralsBlock \ +GBZSTD (ZSTD_decodeLiteralsBlock) +#define ZSTD_decodeSeqHeaders \ +GBZSTD (ZSTD_decodeSeqHeaders) +#define ZSTD_decodingBufferSize_min \ +GBZSTD (ZSTD_decodingBufferSize_min) +#define ZSTD_decompressBegin \ +GBZSTD (ZSTD_decompressBegin) +#define ZSTD_decompressBegin_usingDDict \ +GBZSTD (ZSTD_decompressBegin_usingDDict) +#define ZSTD_decompressBegin_usingDict \ +GBZSTD (ZSTD_decompressBegin_usingDict) +#define ZSTD_decompressBlock \ +GBZSTD (ZSTD_decompressBlock) +#define ZSTD_decompressBlock_internal \ +GBZSTD (ZSTD_decompressBlock_internal) +#define ZSTD_decompressBound \ +GBZSTD (ZSTD_decompressBound) +#define ZSTD_decompressContinue \ +GBZSTD (ZSTD_decompressContinue) +#define ZSTD_decompressDCtx \ +GBZSTD (ZSTD_decompressDCtx) +#define ZSTD_decompressStream \ +GBZSTD (ZSTD_decompressStream) +#define ZSTD_decompressStream_simpleArgs \ +GBZSTD (ZSTD_decompressStream_simpleArgs) +#define ZSTD_decompress_usingDDict \ +GBZSTD (ZSTD_decompress_usingDDict) +#define ZSTD_decompress_usingDict \ +GBZSTD (ZSTD_decompress_usingDict) +#define ZSTD_dedicatedDictSearch_lazy_loadDictionary \ +GBZSTD (ZSTD_dedicatedDictSearch_lazy_loadDictionary) +#define ZSTD_defaultCLevel \ +GBZSTD (ZSTD_defaultCLevel) +#define ZSTD_dParam_getBounds \ +GBZSTD (ZSTD_dParam_getBounds) +#define ZSTD_DStreamInSize \ +GBZSTD (ZSTD_DStreamInSize) +#define ZSTD_DStreamOutSize \ +GBZSTD (ZSTD_DStreamOutSize) +#define ZSTD_encodeSequences \ +GBZSTD (ZSTD_encodeSequences) +#define ZSTD_endStream \ +GBZSTD (ZSTD_endStream) +#define ZSTD_estimateCCtxSize \ +GBZSTD (ZSTD_estimateCCtxSize) +#define ZSTD_estimateCCtxSize_usingCCtxParams \ +GBZSTD (ZSTD_estimateCCtxSize_usingCCtxParams) +#define ZSTD_estimateCCtxSize_usingCParams \ +GBZSTD (ZSTD_estimateCCtxSize_usingCParams) +#define ZSTD_estimateCDictSize \ +GBZSTD (ZSTD_estimateCDictSize) +#define ZSTD_estimateCDictSize_advanced \ +GBZSTD (ZSTD_estimateCDictSize_advanced) +#define ZSTD_estimateCStreamSize \ +GBZSTD (ZSTD_estimateCStreamSize) +#define ZSTD_estimateCStreamSize_usingCCtxParams \ +GBZSTD (ZSTD_estimateCStreamSize_usingCCtxParams) +#define ZSTD_estimateCStreamSize_usingCParams \ +GBZSTD (ZSTD_estimateCStreamSize_usingCParams) +#define ZSTD_estimateDCtxSize \ +GBZSTD (ZSTD_estimateDCtxSize) +#define ZSTD_estimateDDictSize \ +GBZSTD (ZSTD_estimateDDictSize) +#define ZSTD_estimateDStreamSize \ +GBZSTD (ZSTD_estimateDStreamSize) +#define ZSTD_estimateDStreamSize_fromFrame \ +GBZSTD (ZSTD_estimateDStreamSize_fromFrame) +#define ZSTD_fillDoubleHashTable \ +GBZSTD (ZSTD_fillDoubleHashTable) +#define ZSTD_fillHashTable \ +GBZSTD (ZSTD_fillHashTable) +#define ZSTD_findDecompressedSize \ +GBZSTD (ZSTD_findDecompressedSize) +#define ZSTD_findFrameCompressedSize \ +GBZSTD (ZSTD_findFrameCompressedSize) +#define ZSTD_flushStream \ +GBZSTD (ZSTD_flushStream) +#define ZSTD_frameHeaderSize \ +GBZSTD (ZSTD_frameHeaderSize) +#define ZSTD_freeCCtx \ +GBZSTD (ZSTD_freeCCtx) +#define ZSTD_freeCCtxParams \ +GBZSTD (ZSTD_freeCCtxParams) +#define ZSTD_freeCDict \ +GBZSTD (ZSTD_freeCDict) +#define ZSTD_freeCStream \ +GBZSTD (ZSTD_freeCStream) +#define ZSTD_freeDCtx \ +GBZSTD (ZSTD_freeDCtx) +#define ZSTD_freeDDict \ +GBZSTD (ZSTD_freeDDict) +#define ZSTD_freeDStream \ +GBZSTD (ZSTD_freeDStream) +#define ZSTD_fseBitCost \ +GBZSTD (ZSTD_fseBitCost) +#define ZSTD_generateSequences \ +GBZSTD (ZSTD_generateSequences) +#define ZSTD_getBlockSize \ +GBZSTD (ZSTD_getBlockSize) +#define ZSTD_getcBlockSize \ +GBZSTD 
(ZSTD_getcBlockSize) +#define ZSTD_getCParams \ +GBZSTD (ZSTD_getCParams) +#define ZSTD_getCParamsFromCCtxParams \ +GBZSTD (ZSTD_getCParamsFromCCtxParams) +#define ZSTD_getCParamsFromCDict \ +GBZSTD (ZSTD_getCParamsFromCDict) +#define ZSTD_getDecompressedSize \ +GBZSTD (ZSTD_getDecompressedSize) +#define ZSTD_getDictID_fromCDict \ +GBZSTD (ZSTD_getDictID_fromCDict) +#define ZSTD_getDictID_fromDDict \ +GBZSTD (ZSTD_getDictID_fromDDict) +#define ZSTD_getDictID_fromDict \ +GBZSTD (ZSTD_getDictID_fromDict) +#define ZSTD_getDictID_fromFrame \ +GBZSTD (ZSTD_getDictID_fromFrame) +#define ZSTD_getErrorCode \ +GBZSTD (ZSTD_getErrorCode) +#define ZSTD_getErrorName \ +GBZSTD (ZSTD_getErrorName) +#define ZSTD_getErrorString \ +GBZSTD (ZSTD_getErrorString) +#define ZSTD_getFrameContentSize \ +GBZSTD (ZSTD_getFrameContentSize) +#define ZSTD_getFrameHeader \ +GBZSTD (ZSTD_getFrameHeader) +#define ZSTD_getFrameHeader_advanced \ +GBZSTD (ZSTD_getFrameHeader_advanced) +#define ZSTD_getFrameProgression \ +GBZSTD (ZSTD_getFrameProgression) +#define ZSTD_getParams \ +GBZSTD (ZSTD_getParams) +#define ZSTD_getSeqStore \ +GBZSTD (ZSTD_getSeqStore) +#define ZSTD_initCStream \ +GBZSTD (ZSTD_initCStream) +#define ZSTD_initCStream_advanced \ +GBZSTD (ZSTD_initCStream_advanced) +#define ZSTD_initCStream_internal \ +GBZSTD (ZSTD_initCStream_internal) +#define ZSTD_initCStream_srcSize \ +GBZSTD (ZSTD_initCStream_srcSize) +#define ZSTD_initCStream_usingCDict \ +GBZSTD (ZSTD_initCStream_usingCDict) +#define ZSTD_initCStream_usingCDict_advanced \ +GBZSTD (ZSTD_initCStream_usingCDict_advanced) +#define ZSTD_initCStream_usingDict \ +GBZSTD (ZSTD_initCStream_usingDict) +#define ZSTD_initDStream \ +GBZSTD (ZSTD_initDStream) +#define ZSTD_initDStream_usingDDict \ +GBZSTD (ZSTD_initDStream_usingDDict) +#define ZSTD_initDStream_usingDict \ +GBZSTD (ZSTD_initDStream_usingDict) +#define ZSTD_initStaticCCtx \ +GBZSTD (ZSTD_initStaticCCtx) +#define ZSTD_initStaticCDict \ +GBZSTD (ZSTD_initStaticCDict) +#define ZSTD_initStaticCStream \ +GBZSTD (ZSTD_initStaticCStream) +#define ZSTD_initStaticDCtx \ +GBZSTD (ZSTD_initStaticDCtx) +#define ZSTD_initStaticDDict \ +GBZSTD (ZSTD_initStaticDDict) +#define ZSTD_initStaticDStream \ +GBZSTD (ZSTD_initStaticDStream) +#define ZSTD_insertAndFindFirstIndex \ +GBZSTD (ZSTD_insertAndFindFirstIndex) +#define ZSTD_insertBlock \ +GBZSTD (ZSTD_insertBlock) +#define ZSTD_invalidateRepCodes \ +GBZSTD (ZSTD_invalidateRepCodes) + +#define ZSTD_isFrame \ +GBZSTD (ZSTD_isFrame) +#define ZSTD_isSkippableFrame \ +GBZSTD (ZSTD_isSkippableFrame) +#define ZSTD_ldm_adjustParameters \ +GBZSTD (ZSTD_ldm_adjustParameters) +#define ZSTD_ldm_blockCompress \ +GBZSTD (ZSTD_ldm_blockCompress) +#define ZSTD_ldm_fillHashTable \ +GBZSTD (ZSTD_ldm_fillHashTable) +#define ZSTD_ldm_generateSequences \ +GBZSTD (ZSTD_ldm_generateSequences) +#define ZSTD_ldm_getMaxNbSeq \ +GBZSTD (ZSTD_ldm_getMaxNbSeq) +#define ZSTD_ldm_getTableSize \ +GBZSTD (ZSTD_ldm_getTableSize) +#define ZSTD_ldm_skipRawSeqStoreBytes \ +GBZSTD (ZSTD_ldm_skipRawSeqStoreBytes) +#define ZSTD_ldm_skipSequences \ +GBZSTD (ZSTD_ldm_skipSequences) +#define ZSTD_loadCEntropy \ +GBZSTD (ZSTD_loadCEntropy) +#define ZSTD_loadDEntropy \ +GBZSTD (ZSTD_loadDEntropy) +#define ZSTD_maxCLevel \ +GBZSTD (ZSTD_maxCLevel) +#define ZSTD_mergeBlockDelimiters \ +GBZSTD (ZSTD_mergeBlockDelimiters) +#define ZSTD_minCLevel \ +GBZSTD (ZSTD_minCLevel) +#define ZSTDMT_compressStream_generic \ +GBZSTD (ZSTDMT_compressStream_generic) +#define ZSTDMT_createCCtx_advanced \ +GBZSTD 
(ZSTDMT_createCCtx_advanced) +#define ZSTDMT_freeCCtx \ +GBZSTD (ZSTDMT_freeCCtx) +#define ZSTDMT_getFrameProgression \ +GBZSTD (ZSTDMT_getFrameProgression) +#define ZSTDMT_initCStream_internal \ +GBZSTD (ZSTDMT_initCStream_internal) +#define ZSTDMT_nextInputSizeHint \ +GBZSTD (ZSTDMT_nextInputSizeHint) +#define ZSTDMT_sizeof_CCtx \ +GBZSTD (ZSTDMT_sizeof_CCtx) +#define ZSTDMT_toFlushNow \ +GBZSTD (ZSTDMT_toFlushNow) +#define ZSTDMT_updateCParams_whileCompressing \ +GBZSTD (ZSTDMT_updateCParams_whileCompressing) +#define ZSTD_nextInputType \ +GBZSTD (ZSTD_nextInputType) +#define ZSTD_nextSrcSizeToDecompress \ +GBZSTD (ZSTD_nextSrcSizeToDecompress) +#define ZSTD_noCompressLiterals \ +GBZSTD (ZSTD_noCompressLiterals) +#define ZSTD_readSkippableFrame \ +GBZSTD (ZSTD_readSkippableFrame) +#define ZSTD_referenceExternalSequences \ +GBZSTD (ZSTD_referenceExternalSequences) +#define ZSTD_reset_compressedBlockState \ +GBZSTD (ZSTD_reset_compressedBlockState) +#define ZSTD_resetCStream \ +GBZSTD (ZSTD_resetCStream) +#define ZSTD_resetDStream \ +GBZSTD (ZSTD_resetDStream) +#define ZSTD_resetSeqStore \ +GBZSTD (ZSTD_resetSeqStore) +#define ZSTD_row_update \ +GBZSTD (ZSTD_row_update) +#define ZSTD_selectBlockCompressor \ +GBZSTD (ZSTD_selectBlockCompressor) +#define ZSTD_selectEncodingType \ +GBZSTD (ZSTD_selectEncodingType) +#define ZSTD_seqToCodes \ +GBZSTD (ZSTD_seqToCodes) +#define ZSTD_sizeof_CCtx \ +GBZSTD (ZSTD_sizeof_CCtx) +#define ZSTD_sizeof_CDict \ +GBZSTD (ZSTD_sizeof_CDict) +#define ZSTD_sizeof_CStream \ +GBZSTD (ZSTD_sizeof_CStream) +#define ZSTD_sizeof_DCtx \ +GBZSTD (ZSTD_sizeof_DCtx) +#define ZSTD_sizeof_DDict \ +GBZSTD (ZSTD_sizeof_DDict) +#define ZSTD_sizeof_DStream \ +GBZSTD (ZSTD_sizeof_DStream) +#define ZSTD_toFlushNow \ +GBZSTD (ZSTD_toFlushNow) +#define ZSTD_trace_compress_begin \ +GBZSTD (ZSTD_trace_compress_begin) +#define ZSTD_trace_compress_end \ +GBZSTD (ZSTD_trace_compress_end) +#define ZSTD_trace_decompress_begin \ +GBZSTD (ZSTD_trace_decompress_begin) +#define ZSTD_trace_decompress_end \ +GBZSTD (ZSTD_trace_decompress_end) +#define ZSTD_updateTree \ +GBZSTD (ZSTD_updateTree) +#define ZSTD_versionNumber \ +GBZSTD (ZSTD_versionNumber) +#define ZSTD_versionString \ +GBZSTD (ZSTD_versionString) +#define ZSTD_writeLastEmptyBlock \ +GBZSTD (ZSTD_writeLastEmptyBlock) +#define ZSTD_writeSkippableFrame \ +GBZSTD (ZSTD_writeSkippableFrame) +#define ZSTD_XXH32 \ +GBZSTD (ZSTD_XXH32) +#define ZSTD_XXH32_canonicalFromHash \ +GBZSTD (ZSTD_XXH32_canonicalFromHash) +#define ZSTD_XXH32_copyState \ +GBZSTD (ZSTD_XXH32_copyState) +#define ZSTD_XXH32_createState \ +GBZSTD (ZSTD_XXH32_createState) +#define ZSTD_XXH32_digest \ +GBZSTD (ZSTD_XXH32_digest) +#define ZSTD_XXH32_freeState \ +GBZSTD (ZSTD_XXH32_freeState) +#define ZSTD_XXH32_hashFromCanonical \ +GBZSTD (ZSTD_XXH32_hashFromCanonical) +#define ZSTD_XXH32_reset \ +GBZSTD (ZSTD_XXH32_reset) +#define ZSTD_XXH32_update \ +GBZSTD (ZSTD_XXH32_update) +#define ZSTD_XXH64 \ +GBZSTD (ZSTD_XXH64) +#define ZSTD_XXH64_canonicalFromHash \ +GBZSTD (ZSTD_XXH64_canonicalFromHash) +#define ZSTD_XXH64_copyState \ +GBZSTD (ZSTD_XXH64_copyState) +#define ZSTD_XXH64_createState \ +GBZSTD (ZSTD_XXH64_createState) +#define ZSTD_XXH64_digest \ +GBZSTD (ZSTD_XXH64_digest) +#define ZSTD_XXH64_freeState \ +GBZSTD (ZSTD_XXH64_freeState) +#define ZSTD_XXH64_hashFromCanonical \ +GBZSTD (ZSTD_XXH64_hashFromCanonical) +#define ZSTD_XXH64_reset \ +GBZSTD (ZSTD_XXH64_reset) +#define ZSTD_XXH64_update \ +GBZSTD (ZSTD_XXH64_update) +#define ZSTD_XXH_versionNumber \ 
+GBZSTD (ZSTD_XXH_versionNumber) + +#define HIST_count \ +GBZSTD (HIST_count) +#define HIST_countFast \ +GBZSTD (HIST_countFast) +#define HIST_countFast_wksp \ +GBZSTD (HIST_countFast_wksp) +#define HIST_count_simple \ +GBZSTD (HIST_count_simple) +#define HIST_count_wksp \ +GBZSTD (HIST_count_wksp) +#define HIST_isError \ +GBZSTD (HIST_isError) + +#define HUF_buildCTable \ +GBZSTD (HUF_buildCTable) +#define HUF_buildCTable_wksp \ +GBZSTD (HUF_buildCTable_wksp) +#define HUF_compress \ +GBZSTD (HUF_compress) +#define HUF_compress1X \ +GBZSTD (HUF_compress1X) +#define HUF_compress1X_repeat \ +GBZSTD (HUF_compress1X_repeat) +#define HUF_compress1X_usingCTable \ +GBZSTD (HUF_compress1X_usingCTable) +#define HUF_compress1X_usingCTable_bmi2 \ +GBZSTD (HUF_compress1X_usingCTable_bmi2) +#define HUF_compress1X_wksp \ +GBZSTD (HUF_compress1X_wksp) +#define HUF_compress2 \ +GBZSTD (HUF_compress2) +#define HUF_compress4X_repeat \ +GBZSTD (HUF_compress4X_repeat) +#define HUF_compress4X_usingCTable \ +GBZSTD (HUF_compress4X_usingCTable) +#define HUF_compress4X_usingCTable_bmi2 \ +GBZSTD (HUF_compress4X_usingCTable_bmi2) +#define HUF_compress4X_wksp \ +GBZSTD (HUF_compress4X_wksp) +#define HUF_compressBound \ +GBZSTD (HUF_compressBound) +#define HUF_decompress \ +GBZSTD (HUF_decompress) +#define HUF_decompress1X1 \ +GBZSTD (HUF_decompress1X1) +#define HUF_decompress1X1_DCtx \ +GBZSTD (HUF_decompress1X1_DCtx) +#define HUF_decompress1X1_DCtx_wksp \ +GBZSTD (HUF_decompress1X1_DCtx_wksp) +#define HUF_decompress1X1_DCtx_wksp_bmi2 \ +GBZSTD (HUF_decompress1X1_DCtx_wksp_bmi2) +#define HUF_decompress1X1_usingDTable \ +GBZSTD (HUF_decompress1X1_usingDTable) +#define HUF_decompress1X2 \ +GBZSTD (HUF_decompress1X2) +#define HUF_decompress1X2_DCtx \ +GBZSTD (HUF_decompress1X2_DCtx) +#define HUF_decompress1X2_DCtx_wksp \ +GBZSTD (HUF_decompress1X2_DCtx_wksp) +#define HUF_decompress1X2_usingDTable \ +GBZSTD (HUF_decompress1X2_usingDTable) +#define HUF_decompress1X_DCtx \ +GBZSTD (HUF_decompress1X_DCtx) +#define HUF_decompress1X_DCtx_wksp \ +GBZSTD (HUF_decompress1X_DCtx_wksp) +#define HUF_decompress1X_usingDTable \ +GBZSTD (HUF_decompress1X_usingDTable) +#define HUF_decompress1X_usingDTable_bmi2 \ +GBZSTD (HUF_decompress1X_usingDTable_bmi2) +#define HUF_decompress4X1 \ +GBZSTD (HUF_decompress4X1) +#define HUF_decompress4X1_DCtx \ +GBZSTD (HUF_decompress4X1_DCtx) +#define HUF_decompress4X1_DCtx_wksp \ +GBZSTD (HUF_decompress4X1_DCtx_wksp) +#define HUF_decompress4X1_usingDTable \ +GBZSTD (HUF_decompress4X1_usingDTable) +#define HUF_decompress4X2 \ +GBZSTD (HUF_decompress4X2) +#define HUF_decompress4X2_DCtx \ +GBZSTD (HUF_decompress4X2_DCtx) +#define HUF_decompress4X2_DCtx_wksp \ +GBZSTD (HUF_decompress4X2_DCtx_wksp) +#define HUF_decompress4X2_usingDTable \ +GBZSTD (HUF_decompress4X2_usingDTable) +#define HUF_decompress4X_DCtx \ +GBZSTD (HUF_decompress4X_DCtx) +#define HUF_decompress4X_hufOnly \ +GBZSTD (HUF_decompress4X_hufOnly) +#define HUF_decompress4X_hufOnly_wksp \ +GBZSTD (HUF_decompress4X_hufOnly_wksp) +#define HUF_decompress4X_hufOnly_wksp_bmi2 \ +GBZSTD (HUF_decompress4X_hufOnly_wksp_bmi2) +#define HUF_decompress4X_usingDTable \ +GBZSTD (HUF_decompress4X_usingDTable) +#define HUF_decompress4X_usingDTable_bmi2 \ +GBZSTD (HUF_decompress4X_usingDTable_bmi2) +#define HUF_estimateCompressedSize \ +GBZSTD (HUF_estimateCompressedSize) +#define HUF_getErrorName \ +GBZSTD (HUF_getErrorName) +#define HUF_getNbBitsFromCTable \ +GBZSTD (HUF_getNbBitsFromCTable) +#define HUF_optimalTableLog \ +GBZSTD (HUF_optimalTableLog) 
+#define HUF_readCTable \ +GBZSTD (HUF_readCTable) +#define HUF_readDTableX1 \ +GBZSTD (HUF_readDTableX1) +#define HUF_readDTableX1_wksp \ +GBZSTD (HUF_readDTableX1_wksp) +#define HUF_readDTableX1_wksp_bmi2 \ +GBZSTD (HUF_readDTableX1_wksp_bmi2) +#define HUF_readDTableX2 \ +GBZSTD (HUF_readDTableX2) +#define HUF_readDTableX2_wksp \ +GBZSTD (HUF_readDTableX2_wksp) +#define HUF_readDTableX2_wksp_bmi2 \ +GBZSTD (HUF_readDTableX2_wksp_bmi2) +#define HUF_readStats \ +GBZSTD (HUF_readStats) +#define HUF_readStats_wksp \ +GBZSTD (HUF_readStats_wksp) +#define HUF_selectDecoder \ +GBZSTD (HUF_selectDecoder) +#define HUF_validateCTable \ +GBZSTD (HUF_validateCTable) +#define HUF_writeCTable \ +GBZSTD (HUF_writeCTable) +#define HUF_writeCTable_wksp \ +GBZSTD (HUF_writeCTable_wksp) + +#define POOL_add \ +GBZSTD (POOL_add) +#define POOL_create \ +GBZSTD (POOL_create) +#define POOL_create_advanced \ +GBZSTD (POOL_create_advanced) +#define POOL_free \ +GBZSTD (POOL_free) +#define POOL_joinJobs \ +GBZSTD (POOL_joinJobs) +#define POOL_resize \ +GBZSTD (POOL_resize) +#define POOL_sizeof \ +GBZSTD (POOL_sizeof) +#define POOL_tryAdd \ +GBZSTD (POOL_tryAdd) + +#define ERR_getErrorString \ +GBZSTD (ERR_getErrorString) + +#define FSE_buildCTable_raw \ +GBZSTD (FSE_buildCTable_raw) +#define FSE_buildCTable_rle \ +GBZSTD (FSE_buildCTable_rle) +#define FSE_buildCTable_wksp \ +GBZSTD (FSE_buildCTable_wksp) +#define FSE_buildDTable \ +GBZSTD (FSE_buildDTable) +#define FSE_buildDTable_raw \ +GBZSTD (FSE_buildDTable_raw) +#define FSE_buildDTable_rle \ +GBZSTD (FSE_buildDTable_rle) +#define FSE_buildDTable_wksp \ +GBZSTD (FSE_buildDTable_wksp) +#define FSE_compress \ +GBZSTD (FSE_compress) +#define FSE_compress2 \ +GBZSTD (FSE_compress2) +#define FSE_compressBound \ +GBZSTD (FSE_compressBound) +#define FSE_compress_usingCTable \ +GBZSTD (FSE_compress_usingCTable) +#define FSE_compress_wksp \ +GBZSTD (FSE_compress_wksp) +#define FSE_createCTable \ +GBZSTD (FSE_createCTable) +#define FSE_createDTable \ +GBZSTD (FSE_createDTable) +#define FSE_decompress \ +GBZSTD (FSE_decompress) +#define FSE_decompress_usingDTable \ +GBZSTD (FSE_decompress_usingDTable) +#define FSE_decompress_wksp \ +GBZSTD (FSE_decompress_wksp) +#define FSE_decompress_wksp_bmi2 \ +GBZSTD (FSE_decompress_wksp_bmi2) +#define FSE_freeCTable \ +GBZSTD (FSE_freeCTable) +#define FSE_freeDTable \ +GBZSTD (FSE_freeDTable) +#define FSE_getErrorName \ +GBZSTD (FSE_getErrorName) +#define FSE_NCountWriteBound \ +GBZSTD (FSE_NCountWriteBound) +#define FSE_normalizeCount \ +GBZSTD (FSE_normalizeCount) +#define FSE_optimalTableLog \ +GBZSTD (FSE_optimalTableLog) +#define FSE_optimalTableLog_internal \ +GBZSTD (FSE_optimalTableLog_internal) +#define FSE_readNCount \ +GBZSTD (FSE_readNCount) +#define FSE_readNCount_bmi2 \ +GBZSTD (FSE_readNCount_bmi2) +#define FSE_versionNumber \ +GBZSTD (FSE_versionNumber) +#define FSE_writeNCount \ +GBZSTD (FSE_writeNCount) + +// not renamed: ZSTD_isError, FSE_isError, HUF_isError + +//------------------------------------------------------------------------------ +// disable ZSTD deprecation warnings and include all ZSTD definitions +//------------------------------------------------------------------------------ + +// GraphBLAS does not use deprecated functions, but the warnings pop up anyway +// when GraphBLAS is built, so silence them with this #define: +#define ZSTD_DISABLE_DEPRECATE_WARNINGS + +// do not use multithreading in ZSTD itself. GraphBLAS does the parallelism. 
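Multithreading inside zstd stays off because GB_serialize_array already parallelizes across the independent blocks, with one single-threaded ZSTD_compress call per block. A schematic sketch of that structure (the offset array Ublock and the per-block dst buffers are hypothetical stand-ins; Sblocks records compressed sizes as in the hunks above):

    // each OpenMP iteration compresses one block with single-threaded zstd
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic)
    for (int32_t blockid = 0 ; blockid < nblocks ; blockid++)
    {
        const char *src = X + Ublock [blockid] ;    // start of this block
        size_t srcSize = Ublock [blockid+1] - Ublock [blockid] ;
        size_t s64 = ZSTD_compress (dst [blockid], dstCapacity [blockid],
            src, srcSize, level) ;
        // zstd reports failure as an error code larger than any valid size
        ok = ok && (s64 <= dstCapacity [blockid]) ;
        Sblocks [blockid] = (int64_t) s64 ;         // compressed size
    }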
+#undef ZSTD_MULTITHREAD + +// do not use asm +#define ZSTD_DISABLE_ASM + +#include "zstd.h" +#endif + diff --git a/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp32_fp32.c b/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp32_fp32.c new file mode 100644 index 000000000..a3cf2a2e0 --- /dev/null +++ b/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp32_fp32.c @@ -0,0 +1,131 @@ +//------------------------------------------------------------------------------ +// GB_unop: hard-coded functions for each built-in unary operator +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// If this file is in the Generated2/ folder, do not edit it +// (it is auto-generated from Generator/*). + +#include "GB.h" +#ifndef GBCUDA_DEV +#include "GB_control.h" +#include "GB_atomics.h" +#include "GB_unop__include.h" + +// C=unop(A) is defined by the following types and operators: + +// op(A) function: GB (_unop_apply__cbrt_fp32_fp32) +// op(A') function: GB (_unop_tran__cbrt_fp32_fp32) + +// C type: float +// A type: float +// cast: float cij = aij +// unaryop: cij = cbrtf (aij) + +#define GB_ATYPE \ + float + +#define GB_CTYPE \ + float + +// aij = Ax [pA] +#define GB_GETA(aij,Ax,pA) \ + float aij = Ax [pA] + +#define GB_CX(p) Cx [p] + +// unary operator +#define GB_OP(z, x) \ + z = cbrtf (x) ; + +// casting +#define GB_CAST(z, aij) \ + float z = aij ; + +// cij = op (aij) +#define GB_CAST_OP(pC,pA) \ +{ \ + /* aij = Ax [pA] */ \ + float aij = Ax [pA] ; \ + /* Cx [pC] = op (cast (aij)) */ \ + float z = aij ; \ + Cx [pC] = cbrtf (z) ; \ +} + +// disable this operator and use the generic case if these conditions hold +#define GB_DISABLE \ + (GxB_NO_CBRT || GxB_NO_FP32) + +//------------------------------------------------------------------------------ +// Cx = op (cast (Ax)): apply a unary operator +//------------------------------------------------------------------------------ + + +GrB_Info GB (_unop_apply__cbrt_fp32_fp32) +( + float *Cx, // Cx and Ax may be aliased + const float *Ax, + const int8_t *restrict Ab, // A->b if A is bitmap + int64_t anz, + int nthreads +) +{ + #if GB_DISABLE + return (GrB_NO_VALUE) ; + #else + int64_t p ; + if (Ab == NULL) + { + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (p = 0 ; p < anz ; p++) + { + float aij = Ax [p] ; + float z = aij ; + Cx [p] = cbrtf (z) ; + } + } + else + { + // bitmap case, no transpose; A->b already memcpy'd into C->b + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (p = 0 ; p < anz ; p++) + { + if (!Ab [p]) continue ; + float aij = Ax [p] ; + float z = aij ; + Cx [p] = cbrtf (z) ; + } + } + return (GrB_SUCCESS) ; + #endif +} + + +//------------------------------------------------------------------------------ +// C = op (cast (A')): transpose, typecast, and apply a unary operator +//------------------------------------------------------------------------------ + +GrB_Info GB (_unop_tran__cbrt_fp32_fp32) +( + GrB_Matrix C, + const GrB_Matrix A, + int64_t *restrict *Workspaces, + const int64_t *restrict A_slice, + int nworkspaces, + int nthreads +) +{ + #if GB_DISABLE + return (GrB_NO_VALUE) ; + #else + #include "GB_unop_transpose.c" + return (GrB_SUCCESS) ; + #endif +} + +#endif + diff --git a/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp64_fp64.c 
b/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp64_fp64.c new file mode 100644 index 000000000..1c6cc2418 --- /dev/null +++ b/GraphBLAS/Source/Generated2/GB_unop__cbrt_fp64_fp64.c @@ -0,0 +1,131 @@ +//------------------------------------------------------------------------------ +// GB_unop: hard-coded functions for each built-in unary operator +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// If this file is in the Generated2/ folder, do not edit it +// (it is auto-generated from Generator/*). + +#include "GB.h" +#ifndef GBCUDA_DEV +#include "GB_control.h" +#include "GB_atomics.h" +#include "GB_unop__include.h" + +// C=unop(A) is defined by the following types and operators: + +// op(A) function: GB (_unop_apply__cbrt_fp64_fp64) +// op(A') function: GB (_unop_tran__cbrt_fp64_fp64) + +// C type: double +// A type: double +// cast: double cij = aij +// unaryop: cij = cbrt (aij) + +#define GB_ATYPE \ + double + +#define GB_CTYPE \ + double + +// aij = Ax [pA] +#define GB_GETA(aij,Ax,pA) \ + double aij = Ax [pA] + +#define GB_CX(p) Cx [p] + +// unary operator +#define GB_OP(z, x) \ + z = cbrt (x) ; + +// casting +#define GB_CAST(z, aij) \ + double z = aij ; + +// cij = op (aij) +#define GB_CAST_OP(pC,pA) \ +{ \ + /* aij = Ax [pA] */ \ + double aij = Ax [pA] ; \ + /* Cx [pC] = op (cast (aij)) */ \ + double z = aij ; \ + Cx [pC] = cbrt (z) ; \ +} + +// disable this operator and use the generic case if these conditions hold +#define GB_DISABLE \ + (GxB_NO_CBRT || GxB_NO_FP64) + +//------------------------------------------------------------------------------ +// Cx = op (cast (Ax)): apply a unary operator +//------------------------------------------------------------------------------ + + +GrB_Info GB (_unop_apply__cbrt_fp64_fp64) +( + double *Cx, // Cx and Ax may be aliased + const double *Ax, + const int8_t *restrict Ab, // A->b if A is bitmap + int64_t anz, + int nthreads +) +{ + #if GB_DISABLE + return (GrB_NO_VALUE) ; + #else + int64_t p ; + if (Ab == NULL) + { + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (p = 0 ; p < anz ; p++) + { + double aij = Ax [p] ; + double z = aij ; + Cx [p] = cbrt (z) ; + } + } + else + { + // bitmap case, no transpose; A->b already memcpy'd into C->b + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (p = 0 ; p < anz ; p++) + { + if (!Ab [p]) continue ; + double aij = Ax [p] ; + double z = aij ; + Cx [p] = cbrt (z) ; + } + } + return (GrB_SUCCESS) ; + #endif +} + + +//------------------------------------------------------------------------------ +// C = op (cast (A')): transpose, typecast, and apply a unary operator +//------------------------------------------------------------------------------ + +GrB_Info GB (_unop_tran__cbrt_fp64_fp64) +( + GrB_Matrix C, + const GrB_Matrix A, + int64_t *restrict *Workspaces, + const int64_t *restrict A_slice, + int nworkspaces, + int nthreads +) +{ + #if GB_DISABLE + return (GrB_NO_VALUE) ; + #else + #include "GB_unop_transpose.c" + return (GrB_SUCCESS) ; + #endif +} + +#endif + diff --git a/GraphBLAS/Source/Generated2/GB_unop__include.h b/GraphBLAS/Source/Generated2/GB_unop__include.h index 72fa19eae..1d2fa02af 100644 --- a/GraphBLAS/Source/Generated2/GB_unop__include.h +++ b/GraphBLAS/Source/Generated2/GB_unop__include.h @@ -7423,6 
+7423,50 @@ GrB_Info GB (_unop_tran__erfc_fp64_fp64) // SPDX-License-Identifier: Apache-2.0 +GrB_Info GB (_unop_apply__cbrt_fp32_fp32) +( + float *Cx, + const float *Ax, + const int8_t *restrict Ab, + int64_t anz, + int nthreads +) ; + + +GrB_Info GB (_unop_tran__cbrt_fp32_fp32) +( + GrB_Matrix C, + const GrB_Matrix A, + int64_t *restrict *Workspaces, + const int64_t *restrict A_slice, + int nworkspaces, + int nthreads +) ; + +// SPDX-License-Identifier: Apache-2.0 + +GrB_Info GB (_unop_apply__cbrt_fp64_fp64) +( + double *Cx, + const double *Ax, + const int8_t *restrict Ab, + int64_t anz, + int nthreads +) ; + + +GrB_Info GB (_unop_tran__cbrt_fp64_fp64) +( + GrB_Matrix C, + const GrB_Matrix A, + int64_t *restrict *Workspaces, + const int64_t *restrict A_slice, + int nworkspaces, + int nthreads +) ; + +// SPDX-License-Identifier: Apache-2.0 + GrB_Info GB (_unop_apply__conj_fc32_fc32) ( GxB_FC32_t *Cx, diff --git a/GraphBLAS/Source/GrB_Descriptor_set.c b/GraphBLAS/Source/GrB_Descriptor_set.c index 9ddc8bf37..45a3bd223 100644 --- a/GraphBLAS/Source/GrB_Descriptor_set.c +++ b/GraphBLAS/Source/GrB_Descriptor_set.c @@ -135,10 +135,11 @@ GrB_Info GrB_Descriptor_set // set a parameter in a descriptor GB_ERROR (GrB_INVALID_VALUE, "invalid descriptor field [%d], must be one of:\n" - "GrB_OUTP [%d], GrB_MASK [%d], GrB_INP0 [%d], GrB_INP1 [%d]" - "or GxB_AxB_METHOD [%d]", (int) field, (int) GrB_OUTP, + "GrB_OUTP [%d], GrB_MASK [%d], GrB_INP0 [%d], GrB_INP1 [%d], " + "GxB_AxB_METHOD [%d] or GxB_IMPORT [%d] (use GxB_Desc_set " + "for other descriptor settings)", (int) field, (int) GrB_OUTP, (int) GrB_MASK, (int) GrB_INP0, (int) GrB_INP1, - (int) GxB_AxB_METHOD) ; + (int) GxB_AxB_METHOD, (int) GxB_IMPORT) ; } return (GrB_SUCCESS) ; diff --git a/GraphBLAS/Source/GrB_Matrix_exportSize.c b/GraphBLAS/Source/GrB_Matrix_exportSize.c index 8abfbd33b..516c29640 100644 --- a/GraphBLAS/Source/GrB_Matrix_exportSize.c +++ b/GraphBLAS/Source/GrB_Matrix_exportSize.c @@ -25,7 +25,6 @@ GrB_Info GrB_Matrix_exportSize // determine sizes of user arrays for export //-------------------------------------------------------------------------- GB_WHERE1 ("GrB_Matrix_exportSize (&Ap_len, &Ai_len, &Ax_len, format, A)") ; - GB_BURBLE_START ("GrB_Matrix_exportSize") ; GB_RETURN_IF_NULL_OR_FAULTY (A) ; GB_RETURN_IF_NULL (Ap_len) ; GB_RETURN_IF_NULL (Ai_len) ; diff --git a/GraphBLAS/Source/GrB_Matrix_extractElement.c b/GraphBLAS/Source/GrB_Matrix_extractElement.c index 6c839b788..029d876ce 100644 --- a/GraphBLAS/Source/GrB_Matrix_extractElement.c +++ b/GraphBLAS/Source/GrB_Matrix_extractElement.c @@ -197,3 +197,6 @@ GrB_Info GrB_Matrix_extractElement_Scalar // S = A(i,j) #define GB_XCODE GB_UDT_code #include "GB_Matrix_extractElement.c" +#define GB_EXTRACT_ELEMENT GxB_Matrix_isStoredElement +#include "GB_Matrix_extractElement.c" + diff --git a/GraphBLAS/Source/GrB_Matrix_select.c b/GraphBLAS/Source/GrB_Matrix_select.c index 6a8a52358..0dc7f298c 100644 --- a/GraphBLAS/Source/GrB_Matrix_select.c +++ b/GraphBLAS/Source/GrB_Matrix_select.c @@ -86,19 +86,19 @@ GrB_Info GB_EVAL3 (prefix, _Matrix_select_, T) \ return (GB_sel (C, M, accum, op, A, Thunk, desc, Context)) ; \ } -GB_SEL (GrB, bool , BOOL ) ; -GB_SEL (GrB, int8_t , INT8 ) ; -GB_SEL (GrB, int16_t , INT16 ) ; -GB_SEL (GrB, int32_t , INT32 ) ; -GB_SEL (GrB, int64_t , INT64 ) ; -GB_SEL (GrB, uint8_t , UINT8 ) ; -GB_SEL (GrB, uint16_t , UINT16) ; -GB_SEL (GrB, uint32_t , UINT32) ; -GB_SEL (GrB, uint64_t , UINT64) ; -GB_SEL (GrB, float , FP32 ) ; -GB_SEL (GrB, double , FP64 ) ; -GB_SEL 
(GxB, GxB_FC32_t, FC32 ) ; -GB_SEL (GxB, GxB_FC64_t, FC64 ) ; +GB_SEL (GrB, bool , BOOL ) +GB_SEL (GrB, int8_t , INT8 ) +GB_SEL (GrB, int16_t , INT16 ) +GB_SEL (GrB, int32_t , INT32 ) +GB_SEL (GrB, int64_t , INT64 ) +GB_SEL (GrB, uint8_t , UINT8 ) +GB_SEL (GrB, uint16_t , UINT16) +GB_SEL (GrB, uint32_t , UINT32) +GB_SEL (GrB, uint64_t , UINT64) +GB_SEL (GrB, float , FP32 ) +GB_SEL (GrB, double , FP64 ) +GB_SEL (GxB, GxB_FC32_t, FC32 ) +GB_SEL (GxB, GxB_FC64_t, FC64 ) //------------------------------------------------------------------------------ // GrB_Matrix_select_UDT: select entries from matrix (thunk: user-defined type) diff --git a/GraphBLAS/Source/GrB_Vector_extractElement.c b/GraphBLAS/Source/GrB_Vector_extractElement.c index 24a05765d..b9b4da281 100644 --- a/GraphBLAS/Source/GrB_Vector_extractElement.c +++ b/GraphBLAS/Source/GrB_Vector_extractElement.c @@ -196,3 +196,6 @@ GrB_Info GrB_Vector_extractElement_Scalar // S = V(i,j) #define GB_XCODE GB_UDT_code #include "GB_Vector_extractElement.c" +#define GB_EXTRACT_ELEMENT GxB_Vector_isStoredElement +#include "GB_Vector_extractElement.c" + diff --git a/GraphBLAS/Source/GrB_Vector_select.c b/GraphBLAS/Source/GrB_Vector_select.c index a8b9d04d6..b0ab41c78 100644 --- a/GraphBLAS/Source/GrB_Vector_select.c +++ b/GraphBLAS/Source/GrB_Vector_select.c @@ -84,19 +84,19 @@ GrB_Info GB_EVAL3 (prefix, _Vector_select_, T) \ return (GB_sel (w, M, accum, op, u, Thunk, desc, Context)) ; \ } -GB_SEL (GrB, bool , BOOL ) ; -GB_SEL (GrB, int8_t , INT8 ) ; -GB_SEL (GrB, int16_t , INT16 ) ; -GB_SEL (GrB, int32_t , INT32 ) ; -GB_SEL (GrB, int64_t , INT64 ) ; -GB_SEL (GrB, uint8_t , UINT8 ) ; -GB_SEL (GrB, uint16_t , UINT16) ; -GB_SEL (GrB, uint32_t , UINT32) ; -GB_SEL (GrB, uint64_t , UINT64) ; -GB_SEL (GrB, float , FP32 ) ; -GB_SEL (GrB, double , FP64 ) ; -GB_SEL (GxB, GxB_FC32_t, FC32 ) ; -GB_SEL (GxB, GxB_FC64_t, FC64 ) ; +GB_SEL (GrB, bool , BOOL ) +GB_SEL (GrB, int8_t , INT8 ) +GB_SEL (GrB, int16_t , INT16 ) +GB_SEL (GrB, int32_t , INT32 ) +GB_SEL (GrB, int64_t , INT64 ) +GB_SEL (GrB, uint8_t , UINT8 ) +GB_SEL (GrB, uint16_t , UINT16) +GB_SEL (GrB, uint32_t , UINT32) +GB_SEL (GrB, uint64_t , UINT64) +GB_SEL (GrB, float , FP32 ) +GB_SEL (GrB, double , FP64 ) +GB_SEL (GxB, GxB_FC32_t, FC32 ) +GB_SEL (GxB, GxB_FC64_t, FC64 ) //------------------------------------------------------------------------------ // GrB_Vector_select_UDT: select entries from vector (thunk: user-defined type) diff --git a/GraphBLAS/Source/GxB_Matrix_eWiseUnion.c b/GraphBLAS/Source/GxB_Matrix_eWiseUnion.c index 138466fdc..57b2fc491 100644 --- a/GraphBLAS/Source/GxB_Matrix_eWiseUnion.c +++ b/GraphBLAS/Source/GxB_Matrix_eWiseUnion.c @@ -16,6 +16,15 @@ // else if A(i,j) does not appear but B(i,j) does: // C(i,j) = add (alpha, B(i,j)) +// by contrast, GrB_eWiseAdd does the following: + +// if A(i,j) and B(i,j) both appear: +// C(i,j) = add (A(i,j), B(i,j)) +// else if A(i,j) appears but B(i,j) does not: +// C(i,j) = A(i,j) +// else if A(i,j) does not appear but B(i,j) does: +// C(i,j) = B(i,j) + #include "GB_ewise.h" #include "GB_get_mask.h" diff --git a/GraphBLAS/Source/GxB_Matrix_reshape.c b/GraphBLAS/Source/GxB_Matrix_reshape.c new file mode 100644 index 000000000..e4316877c --- /dev/null +++ b/GraphBLAS/Source/GxB_Matrix_reshape.c @@ -0,0 +1,65 @@ +//------------------------------------------------------------------------------ +// GxB_Matrix_reshape: reshape a matrix in place +//------------------------------------------------------------------------------ + +// 
SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// GxB_Matrix_reshape changes the dimensions of a matrix, reshaping the entries +// by row or by column. + +// For example, if C is 3-by-4 on input, and is reshaped by column to have +// dimensions 2-by-6: + +// C on input C on output (by_col true) +// 00 01 02 03 00 20 11 02 22 13 +// 10 11 12 13 10 01 21 12 03 23 +// 20 21 22 23 + +// If the same C on input is reshaped by row to dimensions 2-by-6: + +// C on input C on output (by_col false) +// 00 01 02 03 00 01 02 03 10 11 +// 10 11 12 13 12 13 20 21 22 23 +// 20 21 22 23 + +// If the input matrix is nrows-by-ncols, and the size of the reshaped matrix +// is nrows_new-by-ncols_new, then nrows*ncols must equal nrows_new*ncols_new. +// The format of the input matrix (by row or by column) is unchanged; this +// format need not match the by_col input parameter. + +#include "GB.h" +#include "GB_reshape.h" + +GrB_Info GxB_Matrix_reshape // reshape a GrB_Matrix in place +( + // input/output: + GrB_Matrix C, // input/output matrix, reshaped in place + // input: + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // new number of rows of C + GrB_Index ncols_new, // new number of columns of C + const GrB_Descriptor desc // to control # of threads used +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + GB_WHERE1 ("GxB_Matrix_reshape (C, nrows_new, ncols_new, desc)") ; + GB_BURBLE_START ("GxB_Matrix_reshape") ; + GB_RETURN_IF_NULL_OR_FAULTY (C) ; + GB_GET_DESCRIPTOR (info, desc, xx1, xx2, xx3, xx4, xx5, xx6, xx7) ; + + //-------------------------------------------------------------------------- + // reshape the matrix + //-------------------------------------------------------------------------- + + info = GB_reshape (NULL, C, by_col, nrows_new, ncols_new, Context) ; + GB_BURBLE_END ; + return (info) ; +} + diff --git a/GraphBLAS/Source/GxB_Matrix_reshapeDup.c b/GraphBLAS/Source/GxB_Matrix_reshapeDup.c new file mode 100644 index 000000000..d909592da --- /dev/null +++ b/GraphBLAS/Source/GxB_Matrix_reshapeDup.c @@ -0,0 +1,52 @@ +//------------------------------------------------------------------------------ +// GxB_Matrix_reshapeDup: reshape a matrix into another matrix +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// See GxB_Matrix_reshape for a description of the output matrix C. + +// If the input matrix A is nrows-by-ncols, and the size of the newly-created +// matrix C is nrows_new-by-ncols_new, then nrows*ncols must equal +// nrows_new*ncols_new. The format of the input matrix A (by row or by column) +// determines the format of the output matrix C, which need not match the +// by_col input parameter.
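// A minimal usage sketch of the two reshape methods defined in these files
// (illustrative only, not part of this changeset; the function name
// reshape_demo, the 3-by-4 FP64 matrix, and the 2-by-6 target shape are
// assumptions chosen for the example; error checking omitted):

#include "GraphBLAS.h"
void reshape_demo (void)            // assumes GrB_init has been called
{
    GrB_Matrix A, C ;
    GrB_Matrix_new (&A, GrB_FP64, 3, 4) ;       // 3-by-4 input matrix
    // ... set entries of A ...
    // C is a newly created 2-by-6 matrix; entries of A taken by column:
    GxB_Matrix_reshapeDup (&C, A, true, 2, 6, NULL) ;
    // or reshape A itself, in place, taking its entries by row:
    GxB_Matrix_reshape (A, false, 2, 6, NULL) ;
    GrB_Matrix_free (&C) ;
    GrB_Matrix_free (&A) ;
}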
+ +#include "GB.h" +#include "GB_reshape.h" + +GrB_Info GxB_Matrix_reshapeDup // reshape a GrB_Matrix into another GrB_Matrix +( + // output: + GrB_Matrix *C, // newly created output matrix, not in place + // input: + GrB_Matrix A, // input matrix, not modified + bool by_col, // true if reshape by column, false if by row + GrB_Index nrows_new, // number of rows of C + GrB_Index ncols_new, // number of columns of C + const GrB_Descriptor desc // to control # of threads used +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + GB_WHERE1 ("GxB_Matrix_reshapeDup (&C, A, nrows_new, ncols_new, desc)") ; + GB_BURBLE_START ("GxB_Matrix_reshapeDup") ; + GB_RETURN_IF_NULL (C) ; + GB_RETURN_IF_NULL_OR_FAULTY (A) ; + GB_GET_DESCRIPTOR (info, desc, xx1, xx2, xx3, xx4, xx5, xx6, xx7) ; + + //-------------------------------------------------------------------------- + // reshape the matrix + //-------------------------------------------------------------------------- + + info = GB_reshape (C, A, by_col, nrows_new, ncols_new, Context) ; + GB_BURBLE_END ; + return (info) ; +} + diff --git a/GraphBLAS/Source/GxB_init.c b/GraphBLAS/Source/GxB_init.c index 3cba73aea..03984a107 100644 --- a/GraphBLAS/Source/GxB_init.c +++ b/GraphBLAS/Source/GxB_init.c @@ -42,53 +42,7 @@ // // GxB_init (mode, rmm_malloc, rmm_calloc, rmm_realloc, rmm_free) ; // -// mode is GrB_BLOCKING or GrB_NONBLOCKING - -#if for_comments_only -compute_system = rmm_wrap_initialize (mode, initpoolsize, maxpoolsize) ; - - create RMM instance - query the GPU(s) available, set their context - compute_system: holds 4 RMM contexts, 4 GPUs, how big they are ... - -p = rmm_wrap_malloc (42) ; // needs the GPUs to be warmed up -... - - // option: - GxB_init (GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, - rmm_wrap_realloc, rmm_wrap_free) ; - - // use GrB just on the CPU cores - GrB_Matrix_new (&C, ...) - GrB_mxm (...) - - GxB_set (GxB_CUDA_SYSTEM_CONTEXT, compute_system) ; // use the GPUs ... - GxB_set (GxB_NTHREDS, 4) ; // use 4 cpu threads - - GxB_get (GxB_CUDA_NGPUS, &ngpus) - - // use GrB just on the GPU 2 - GxB_set (GxB_CUDA_SET_DEVICE, 2) ; - GrB_mxm (C, ...) - GxB_set (C, GxB_SPARSITY, GxB_SPARSE + GxB_HYPERSPARE) ; - GxB_Matrix_Option_set - - GrB_mxm (C, ...) - - ... - GxB_set (GxB_CUDA, true) ; // 0 seconds, GPUs already warmed up - ... - GxB_set (GxB_CUDA, false) ; - ... - GxB_set (GxB_CUDA, true) ; // 0 seconds - GxB_set (GxB_GPUS, [0 2]) ; - ... 
- -GrB_finalize ( ) ; -rmm_wrap_free (p) ; -rmm_wrap_finalize ( ) ; -#endif - +// where mode is GxB_BLOCKING_GPU or GxB_NONBLOCKING_GPU // // To use user-provided malloc and free functions, but not calloc/realloc: // @@ -98,7 +52,7 @@ rmm_wrap_finalize ( ) ; GrB_Info GxB_init // start up GraphBLAS and also define malloc, etc ( - GrB_Mode mode, // blocking or non-blocking mode + GrB_Mode mode, // blocking or non-blocking mode, GPU or not // pointers to memory management functions void * (* user_malloc_function ) (size_t), // required diff --git a/GraphBLAS/Source/Template/GB_Matrix_extractElement.c b/GraphBLAS/Source/Template/GB_Matrix_extractElement.c index 6364d1e94..d76b2bf57 100644 --- a/GraphBLAS/Source/Template/GB_Matrix_extractElement.c +++ b/GraphBLAS/Source/Template/GB_Matrix_extractElement.c @@ -15,12 +15,15 @@ // This template constructs GrB_Matrix_extractElement_[TYPE] for each of the // 13 built-in types, and the _UDT method for all user-defined types. +// It also constructs GxB_Matrix_isStoredElement. // FUTURE: tolerate zombies GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = A(row,col) ( + #ifdef GB_XTYPE GB_XTYPE *x, // scalar to extract, not modified if not found + #endif const GrB_Matrix A, // matrix to extract a scalar from GrB_Index row, // row index GrB_Index col // column index @@ -32,14 +35,16 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = A(row,col) //-------------------------------------------------------------------------- GB_RETURN_IF_NULL_OR_FAULTY (A) ; + #ifdef GB_XTYPE GB_RETURN_IF_NULL (x) ; + #endif // TODO: do not wait unless jumbled. First try to find the element. // If found (live or zombie), no need to wait. If not found and pending // tuples exist, wait and then extractElement again. // delete any lingering zombies, assemble any pending tuples, and unjumble - if (GB_ANY_PENDING_WORK (A)) + if (A->Pending != NULL || A->nzombies > 0 || A->jumbled) { GrB_Info info ; GB_WHERE1 (GB_WHERE_STRING) ; @@ -51,88 +56,68 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = A(row,col) ASSERT (!GB_ANY_PENDING_WORK (A)) ; // look for index i in vector j - int64_t i, j, nrows, ncols ; + int64_t i, j ; + const int64_t vlen = A->vlen ; if (A->is_csc) { i = row ; j = col ; - nrows = A->vlen ; - ncols = A->vdim ; + if (row >= vlen || col >= A->vdim) + { + return (GrB_INVALID_INDEX) ; + } } else { i = col ; j = row ; - nrows = A->vdim ; - ncols = A->vlen ; - } - - // check row and column indices - if (row >= nrows || col >= ncols) - { - return (GrB_INVALID_INDEX) ; - } - - // GB_XCODE and A must be compatible - GB_Type_code acode = A->type->code ; - if (!GB_code_compatible (GB_XCODE, acode)) - { - return (GrB_DOMAIN_MISMATCH) ; - } - - if (GB_nnz (A) == 0) - { - // quick return - return (GrB_NO_VALUE) ; + if (col >= vlen || row >= A->vdim) + { + return (GrB_INVALID_INDEX) ; + } } //-------------------------------------------------------------------------- // find the entry A(i,j) //-------------------------------------------------------------------------- - int64_t pleft ; + int64_t pleft, pright ; bool found ; const int64_t *restrict Ap = A->p ; if (Ap != NULL) { // A is sparse or hypersparse - const int64_t *restrict Ai = A->i ; - - // extract from vector j of a GrB_Matrix - int64_t k ; if (A->h != NULL) { // A is hypersparse: look for j in hyperlist A->h [0 ... 
A->nvec-1] const int64_t *restrict Ah = A->h ; - int64_t pleft = 0 ; - int64_t pright = A->nvec-1 ; - GB_BINARY_SEARCH (j, Ah, pleft, pright, found) ; + int64_t k = 0 ; + pright = A->nvec-1 ; + GB_BINARY_SEARCH (j, Ah, k, pright, found) ; if (!found) { // vector j is empty return (GrB_NO_VALUE) ; } - ASSERT (j == Ah [pleft]) ; - k = pleft ; + ASSERT (j == Ah [k]) ; + pleft = Ap [k] ; + pright = Ap [k+1] - 1 ; } else { - // A is sparse: j = k is the kth vector - k = j ; + // A is sparse: look in the jth vector + pleft = Ap [j] ; + pright = Ap [j+1] - 1 ; } - - pleft = Ap [k] ; - int64_t pright = Ap [k+1] - 1 ; - - // binary search in kth vector for index i // Time taken for this step is at most O(log(nnz(A(:,j))). + const int64_t *restrict Ai = A->i ; GB_BINARY_SEARCH (i, Ai, pleft, pright, found) ; } else { // A is bitmap or full - pleft = i + j * A->vlen ; + pleft = i + j * vlen ; const int8_t *restrict Ab = A->b ; if (Ab != NULL) { @@ -152,28 +137,37 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = A(row,col) if (found) { + // entry found + #ifdef GB_XTYPE + GB_Type_code acode = A->type->code ; #if !defined ( GB_UDT_EXTRACT ) if (GB_XCODE == acode) { - // copy A [pleft] into x, no typecasting, for built-in types only. + // copy Ax [pleft] into x, no typecasting, for built-in types only. GB_XTYPE *restrict Ax = ((GB_XTYPE *) (A->x)) ; (*x) = Ax [A->iso ? 0:pleft] ; } else #endif { - // typecast the value from A [pleft] into x + // typecast the value from Ax [pleft] into x + if (!GB_code_compatible (GB_XCODE, acode)) + { + // x (GB_XCODE) and A (acode) must be compatible + return (GrB_DOMAIN_MISMATCH) ; + } size_t asize = A->type->size ; void *ax = ((GB_void *) A->x) + (A->iso ? 0 : (pleft*asize)) ; GB_cast_scalar (x, GB_XCODE, ax, acode, asize) ; } // TODO: do not flush if extracting to GrB_Scalar #pragma omp flush + #endif return (GrB_SUCCESS) ; } else { - // Entry not found. + // entry not found return (GrB_NO_VALUE) ; } } diff --git a/GraphBLAS/Source/Template/GB_Vector_extractElement.c b/GraphBLAS/Source/Template/GB_Vector_extractElement.c index 06a7a857d..193962298 100644 --- a/GraphBLAS/Source/Template/GB_Vector_extractElement.c +++ b/GraphBLAS/Source/Template/GB_Vector_extractElement.c @@ -15,12 +15,15 @@ // This template constructs GrB_Vector_extractElement_[TYPE], for each of the // 13 built-in types, and the _UDT method for all user-defined types. +// It also constructs GxB_Vector_isStoredElement. 
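// For illustration (an editorial sketch, not part of this changeset; the
// vector and its contents are assumed): the new method takes no output
// scalar, and reports only whether the entry is present, returning
// GrB_SUCCESS if it is stored and GrB_NO_VALUE if it is not:
//
//      GrB_Vector v ;
//      GrB_Vector_new (&v, GrB_FP64, 10) ;
//      GrB_Vector_setElement_FP64 (v, 3.5, 4) ;
//      bool stored4 = (GxB_Vector_isStoredElement (v, 4) == GrB_SUCCESS) ;
//      bool stored5 = (GxB_Vector_isStoredElement (v, 5) == GrB_SUCCESS) ;
//      // stored4 is true; stored5 is false
//
// GxB_Matrix_isStoredElement (A, i, j) is the analogous matrix method.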
// FUTURE: tolerate zombies GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = V(i) ( + #ifdef GB_XTYPE GB_XTYPE *x, // scalar to extract, not modified if not found + #endif const GrB_Vector V, // vector to extract a scalar from GrB_Index i // index ) @@ -31,7 +34,9 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = V(i) //-------------------------------------------------------------------------- GB_RETURN_IF_NULL_OR_FAULTY (V) ; + #ifdef GB_XTYPE GB_RETURN_IF_NULL (x) ; + #endif // delete any lingering zombies, assemble any pending tuples, and unjumble if (GB_ANY_PENDING_WORK (V)) @@ -51,19 +56,6 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = V(i) return (GrB_INVALID_INDEX) ; } - // GB_XCODE and V must be compatible - GB_Type_code vcode = V->type->code ; - if (!GB_code_compatible (GB_XCODE, vcode)) - { - return (GrB_DOMAIN_MISMATCH) ; - } - - if (GB_nnz ((GrB_Matrix) V) == 0) - { - // quick return - return (GrB_NO_VALUE) ; - } - //-------------------------------------------------------------------------- // find the entry V(i) //-------------------------------------------------------------------------- @@ -75,13 +67,10 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = V(i) if (Vp != NULL) { // V is sparse - const int64_t *restrict Vi = V->i ; - pleft = 0 ; int64_t pright = Vp [1] - 1 ; - - // binary search for index i // Time taken for this step is at most O(log(nnz(V))). + const int64_t *restrict Vi = V->i ; GB_BINARY_SEARCH (i, Vi, pleft, pright, found) ; } else @@ -107,29 +96,37 @@ GrB_Info GB_EXTRACT_ELEMENT // extract a single entry, x = V(i) if (found) { + // entry found + #ifdef GB_XTYPE + GB_Type_code vcode = V->type->code ; #if !defined ( GB_UDT_EXTRACT ) if (GB_XCODE == vcode) { - // copy the value from V [...] into the scalar x, no typecasting, - // for built-in types only. + // copy Vx [pleft] into x, no typecasting, for built-in types only. GB_XTYPE *restrict Vx = ((GB_XTYPE *) (V->x)) ; (*x) = Vx [V->iso ? 0:pleft] ; } else #endif { - // typecast the value from V [...] into the scalar x + // typecast the value from Vx [pleft] into x + if (!GB_code_compatible (GB_XCODE, vcode)) + { + // x (GB_XCODE) and V (vcode) must be compatible + return (GrB_DOMAIN_MISMATCH) ; + } size_t vsize = V->type->size ; void *vx = ((GB_void *) V->x) + (V->iso ? 0 : (pleft*vsize)) ; GB_cast_scalar (x, GB_XCODE, vx, vcode, vsize) ; } // TODO: do not flush if extracting to GrB_Scalar #pragma omp flush + #endif return (GrB_SUCCESS) ; } else { - // Entry not found. 
+ // entry not found return (GrB_NO_VALUE) ; } } diff --git a/GraphBLAS/Source/Template/GB_ops_template.c b/GraphBLAS/Source/Template/GB_ops_template.c index 1fa010d88..fd435da3c 100644 --- a/GraphBLAS/Source/Template/GB_ops_template.c +++ b/GraphBLAS/Source/Template/GB_ops_template.c @@ -88,6 +88,7 @@ GXB_OP1 (ONE, "one") ; GXB_OP1 (TGAMMA , "tgamma" ) ; GXB_OP1 (ERF , "erf" ) ; GXB_OP1 (ERFC , "erfc" ) ; + GXB_OP1 (CBRT , "cbrt" ) ; GXB_OP1 (FREXPX , "frexpx" ) ; GXB_OP1 (FREXPE , "frexpe" ) ; #endif @@ -149,57 +150,57 @@ GXB_OP1 (ONE, "one") ; // binary functions z=f(x,y) where z, x, and y all have the same type //------------------------------------------------------------------------------ -GXB_OP2 (RMINUS , "rminus") -GXB_OP2 (RDIV , "rdiv" ) -GXB_OP2 (PAIR , "pair" ) -GXB_OP2 (ANY , "any" ) -GXB_OP2 (ISEQ , "iseq" ) -GXB_OP2 (ISNE , "isne" ) -GXB_OP2 (POW , "pow" ) +GXB_OP2 (RMINUS , "rminus") ; +GXB_OP2 (RDIV , "rdiv" ) ; +GXB_OP2 (PAIR , "pair" ) ; +GXB_OP2 (ANY , "any" ) ; +GXB_OP2 (ISEQ , "iseq" ) ; +GXB_OP2 (ISNE , "isne" ) ; +GXB_OP2 (POW , "pow" ) ; #if defined ( GB_COMPLEX ) // complex types - GXB_OP2 (FIRST , "first" ) - GXB_OP2 (SECOND , "second") - GXB_OP2 (PLUS , "plus" ) - GXB_OP2 (MINUS , "minus" ) - GXB_OP2 (TIMES , "times" ) - GXB_OP2 (DIV , "div" ) + GXB_OP2 (FIRST , "first" ) ; + GXB_OP2 (SECOND , "second") ; + GXB_OP2 (PLUS , "plus" ) ; + GXB_OP2 (MINUS , "minus" ) ; + GXB_OP2 (TIMES , "times" ) ; + GXB_OP2 (DIV , "div" ) ; #else // real types - GRB_OP2 (FIRST , "first" ) - GRB_OP2 (SECOND , "second") - GRB_OP2 (PLUS , "plus" ) - GRB_OP2 (MINUS , "minus" ) - GRB_OP2 (TIMES , "times" ) - GRB_OP2 (DIV , "div" ) + GRB_OP2 (FIRST , "first" ) ; + GRB_OP2 (SECOND , "second") ; + GRB_OP2 (PLUS , "plus" ) ; + GRB_OP2 (MINUS , "minus" ) ; + GRB_OP2 (TIMES , "times" ) ; + GRB_OP2 (DIV , "div" ) ; - GRB_OP2 (MIN , "min" ) - GRB_OP2 (MAX , "max" ) + GRB_OP2 (MIN , "min" ) ; + GRB_OP2 (MAX , "max" ) ; - GXB_OP2 (LOR , "or" ) - GXB_OP2 (LAND , "and" ) - GXB_OP2 (LXOR , "xor" ) + GXB_OP2 (LOR , "or" ) ; + GXB_OP2 (LAND , "and" ) ; + GXB_OP2 (LXOR , "xor" ) ; - GXB_OP2 (ISGT , "isgt") - GXB_OP2 (ISLT , "islt") - GXB_OP2 (ISGE , "isge") - GXB_OP2 (ISLE , "isle") + GXB_OP2 (ISGT , "isgt") ; + GXB_OP2 (ISLT , "islt") ; + GXB_OP2 (ISGE , "isge") ; + GXB_OP2 (ISLE , "isle") ; #endif #if defined (GB_FLOAT) || defined (GB_DOUBLE) // these operators are only defined for float and double - GXB_OP2 (ATAN2 , "atan2" ) - GXB_OP2 (HYPOT , "hypot" ) - GXB_OP2 (FMOD , "fmod" ) - GXB_OP2 (REMAINDER, "remainder") - GXB_OP2 (COPYSIGN , "copysign" ) - GXB_OP2 (LDEXP , "ldexp" ) + GXB_OP2 (ATAN2 , "atan2" ) ; + GXB_OP2 (HYPOT , "hypot" ) ; + GXB_OP2 (FMOD , "fmod" ) ; + GXB_OP2 (REMAINDER, "remainder") ; + GXB_OP2 (COPYSIGN , "copysign" ) ; + GXB_OP2 (LDEXP , "ldexp" ) ; #endif @@ -226,32 +227,32 @@ GXB_OP2 (POW , "pow" ) #if defined ( GB_FLOAT ) // z = cmplx(x,y) where z is float complex, x and y are float - GXB_OP2z (CMPLX, "cmplx", GxB_FC32_t, FC32) + GXB_OP2z (CMPLX, "cmplx", GxB_FC32_t, FC32) ; #endif #if defined ( GB_DOUBLE ) // z = cmplx(x,y) where z is double complex, x and y are double - GXB_OP2z (CMPLX, "cmplx", GxB_FC64_t, FC64) + GXB_OP2z (CMPLX, "cmplx", GxB_FC64_t, FC64) ; #endif #if defined ( GB_COMPLEX ) // complex types - GXB_OP2z (EQ, "eq", bool, BOOL) - GXB_OP2z (NE, "ne", bool, BOOL) + GXB_OP2z (EQ, "eq", bool, BOOL) ; + GXB_OP2z (NE, "ne", bool, BOOL) ; #else // real types - GRB_OP2z (EQ, "eq", bool, BOOL) - GRB_OP2z (NE, "ne", bool, BOOL) - GRB_OP2z (GT, "gt", bool, BOOL) - 
GRB_OP2z (LT, "lt", bool, BOOL) - GRB_OP2z (LE, "le", bool, BOOL) - GRB_OP2z (GE, "ge", bool, BOOL) + GRB_OP2z (EQ, "eq", bool, BOOL) ; + GRB_OP2z (NE, "ne", bool, BOOL) ; + GRB_OP2z (GT, "gt", bool, BOOL) ; + GRB_OP2z (LT, "lt", bool, BOOL) ; + GRB_OP2z (LE, "le", bool, BOOL) ; + GRB_OP2z (GE, "ge", bool, BOOL) ; #endif diff --git a/GraphBLAS/Source/Template/GB_ops_template.h b/GraphBLAS/Source/Template/GB_ops_template.h index f6d16146f..27f1c0903 100644 --- a/GraphBLAS/Source/Template/GB_ops_template.h +++ b/GraphBLAS/Source/Template/GB_ops_template.h @@ -11,14 +11,14 @@ // binary functions. #define GB_UNOP_STRUCT(op,xtype) \ - GB_PUBLIC struct GB_UnaryOp_opaque GB_OPAQUE (GB_EVAL3 (op, _, xtype)) ; + GB_PUBLIC struct GB_UnaryOp_opaque GB_OPAQUE (GB_EVAL3 (op, _, xtype)) #define GB_BINOP_STRUCT(op,xtype) \ - GB_PUBLIC struct GB_BinaryOp_opaque GB_OPAQUE (GB_EVAL3 (op, _, xtype)) ; + GB_PUBLIC struct GB_BinaryOp_opaque GB_OPAQUE (GB_EVAL3 (op, _, xtype)) #define GB_IDXOP_STRUCT(op,xtype) \ GB_PUBLIC struct GB_IndexUnaryOp_opaque \ - GB_OPAQUE (GB_EVAL3 (op, _, xtype)) ; + GB_OPAQUE (GB_EVAL3 (op, _, xtype)) //------------------------------------------------------------------------------ // z = one (x) @@ -253,6 +253,7 @@ GB_UNOP_STRUCT (ABS, GB_XTYPE) ; GB_OP (TGAMMA, tgammaf ) GB_OP (ERF , erff ) GB_OP (ERFC , erfcf ) + GB_OP (CBRT , cbrtf ) #elif defined ( GB_DOUBLE ) @@ -297,6 +298,7 @@ GB_UNOP_STRUCT (ABS, GB_XTYPE) ; GB_OP (TGAMMA, tgamma ) GB_OP (ERF , erf ) GB_OP (ERFC , erfc ) + GB_OP (CBRT , cbrt ) #elif defined ( GB_FLOAT_COMPLEX ) diff --git a/GraphBLAS/Source/Template/GB_unop_factory.c b/GraphBLAS/Source/Template/GB_unop_factory.c index 23b510ccb..1ff9f31c5 100644 --- a/GraphBLAS/Source/Template/GB_unop_factory.c +++ b/GraphBLAS/Source/Template/GB_unop_factory.c @@ -600,6 +600,16 @@ } break ; + case GB_CBRT_unop_code : // z = cbrt (x) + + switch (code1) + { + case GB_FP32_code : GB_WORKER (_cbrt, _fp32, float , _fp32, float ) + case GB_FP64_code : GB_WORKER (_cbrt, _fp64, double , _fp64, double) + default: ; + } + break ; + case GB_FREXPX_unop_code : // z = frexpx (x), mantissa from ANSI C11 frexp switch (code1) diff --git a/GraphBLAS/Source/codegen_unop.m b/GraphBLAS/Source/codegen_unop.m index 3bd5d59be..1439e23e7 100644 --- a/GraphBLAS/Source/codegen_unop.m +++ b/GraphBLAS/Source/codegen_unop.m @@ -354,6 +354,15 @@ [ ], ... % GxB_FC32_t [ ]) ; ... % GxB_FC64_t +codegen_unop_template ('cbrt', ... + [ ], ... % bool + [ ], ... % int + [ ], ... % uint + 'cbrtf (xarg)', ... % float + 'cbrt (xarg)', ... % double + [ ], ... % GxB_FC32_t + [ ]) ; ... % GxB_FC64_t + codegen_unop_template ('conj', ... [ ], ... % bool [ ], ... 
% int diff --git a/GraphBLAS/Tcov/Makefile b/GraphBLAS/Tcov/Makefile index 4fe96bf82..389a59ead 100644 --- a/GraphBLAS/Tcov/Makefile +++ b/GraphBLAS/Tcov/Makefile @@ -30,7 +30,8 @@ INC = tmp_include/* SRC2 = $(notdir $(wildcard $(SRC))) OBJ = $(SRC2:.c=.o) LIBS = -CPPFLAGS = -Itmp_include -DGBNCPUFEAT -I../rmm_wrap -DGBCOVER $(RENAME) +I = -Itmp_include -I../rmm_wrap -I../zstd -I../zstd/zstd_subset -I../lz4 +CPPFLAGS = $(I) -DGBNCPUFEAT -DGBCOVER $(RENAME) SO_OPTS = $(LDFLAGS) ifeq ($(UNAME),Darwin) diff --git a/GraphBLAS/Tcov/grbcover.m b/GraphBLAS/Tcov/grbcover.m index 4f59503e6..b29f7d1dd 100644 --- a/GraphBLAS/Tcov/grbcover.m +++ b/GraphBLAS/Tcov/grbcover.m @@ -41,6 +41,7 @@ function grbcover (what) % list of include directories inc = '-Itmp_include -I../Test -I../Test/Template -I../lz4 -I../rmm_wrap' ; +inc = [inc ' -I../zstd -I../zstd/zstd_subset'] ; have_octave = (exist ('OCTAVE_VERSION', 'builtin') == 5) ; if (have_octave) diff --git a/GraphBLAS/Tcov/grbmake.m b/GraphBLAS/Tcov/grbmake.m index fa0967190..e389f5b2f 100644 --- a/GraphBLAS/Tcov/grbmake.m +++ b/GraphBLAS/Tcov/grbmake.m @@ -20,8 +20,11 @@ % create the include files and place in tmp_include hfiles = [ dir('../Include/*') ; ... dir('../Source/*.h') ; ... - dir('../lz4/*.h') ; ... - dir('../lz4/*.c') ; ... +% dir('../lz4/*.h') ; ... +% dir('../lz4/*.c') ; ... +% dir('../zstd/zstd_subset/*.h') ; ... +% dir('../zstd/zstd_subset/*/*.c') ; ... +% dir('../zstd/zstd_subset/*/*.h') ; ... dir('../Source/Template') ; ... dir('../Source/Generated1/*.h') ; ... dir('../Source/Generated2/*.h') ; ] ; diff --git a/GraphBLAS/Tcov/log_Apr8_2022.txt b/GraphBLAS/Tcov/log_Apr8_2022.txt deleted file mode 100644 index dfad22372..000000000 --- a/GraphBLAS/Tcov/log_Apr8_2022.txt +++ /dev/null @@ -1,144 +0,0 @@ - ----------------------------------------------- [malloc] [cover] -08-Apr 15:28:34 test243 20.5 sec 206: 19624 of 19830 1.0% 10.06/sec -08-Apr 15:29:15 test242 41.2 sec 310: 19314 of 19830 2.6% 7.52/sec -08-Apr 15:29:15 test241 0.3 sec 144: 19170 of 19830 3.3% 540.39/sec -08-Apr 15:30:48 testca 92.1 sec 496: 18674 of 19830 5.8% 5.38/sec -08-Apr 15:30:48 test240 0.3 sec 19: 18655 of 19830 5.9% 61.23/sec -08-Apr 15:30:48 test240 0.2 sec 5: 18650 of 19830 6.0% 22.53/sec -08-Apr 15:32:36 testca 107.8 sec 12: 18638 of 19830 6.0% 0.11/sec -08-Apr 15:32:52 test238 16.2 sec 161: 18477 of 19830 6.8% 9.95/sec -08-Apr 15:32:53 test237 1.2 sec 3: 18474 of 19830 6.8% 2.55/sec -08-Apr 15:32:57 test236 3.7 sec 109: 18365 of 19830 7.4% 29.13/sec -08-Apr 15:33:03 test192 5.8 sec 46: 18319 of 19830 7.6% 7.93/sec -08-Apr 15:33:16 test191 12.9 sec 63: 18256 of 19830 7.9% 4.90/sec -08-Apr 15:34:09 test188 53.3 sec 348: 17908 of 19830 9.7% 6.52/sec -08-Apr 15:34:18 test187 9.1 sec 20: 17888 of 19830 9.8% 2.20/sec -08-Apr 15:34:19 test186 0.4 sec 50: 17838 of 19830 10.0% 112.03/sec -08-Apr 15:34:19 test186 0.4 sec 10: 17828 of 19830 10.1% 27.04/sec -08-Apr 15:36:10 test185 110.9 sec 40: 17788 of 19830 10.3% 0.36/sec -08-Apr 15:36:13 test184 3.3 sec 51: 17737 of 19830 10.6% 15.37/sec -08-Apr 15:36:35 test181 21.4 sec 102: 17635 of 19830 11.1% 4.77/sec -08-Apr 15:36:43 test180 8.6 sec 213: 17422 of 19830 12.1% 24.67/sec -08-Apr 15:37:00 test180 16.8 sec 11: 17411 of 19830 12.2% 0.65/sec -08-Apr 15:37:00 test150 0.1 sec 41: 17370 of 19830 12.4% 371.60/sec -08-Apr 15:37:12 test14 12.1 sec 710: 16660 of 19830 16.0% 58.64/sec -08-Apr 15:39:02 test154 109.4 sec 1906: 14754 of 19830 25.6% 17.42/sec -08-Apr 15:39:32 test151b 30.5 sec 235: 14519 of 19830 26.8% 7.71/sec 
-08-Apr 15:39:32 test239 0.0 sec 14: 14505 of 19830 26.9% 506.46/sec -08-Apr 15:48:14 test74 521.9 sec 5982: 8523 of 19830 57.0% 11.46/sec -08-Apr 15:48:14 test235 0.0 sec 4: 8519 of 19830 57.0% 99.49/sec -08-Apr 15:49:41 test234 87.0 sec 454: 8065 of 19830 59.3% 5.22/sec -08-Apr 15:49:49 test233 7.9 sec 3: 8062 of 19830 59.3% 0.38/sec -08-Apr 15:49:51 test232 2.2 sec 57: 8005 of 19830 59.6% 25.97/sec -08-Apr 15:56:57 test231 426.0 sec 701: 7304 of 19830 63.2% 1.65/sec -08-Apr 15:58:25 test230 88.0 sec 118: 7186 of 19830 63.8% 1.34/sec -08-Apr 15:58:26 test229 0.7 sec 8: 7178 of 19830 63.8% 11.79/sec -08-Apr 15:58:32 test228 5.8 sec 46: 7132 of 19830 64.0% 7.99/sec -08-Apr 15:58:38 test227 5.8 sec 38: 7094 of 19830 64.2% 6.56/sec -08-Apr 15:58:38 test226 0.0 sec 6: 7088 of 19830 64.3% 476.23/sec -08-Apr 15:58:38 test225 0.2 sec 4: 7084 of 19830 64.3% 26.55/sec -08-Apr 15:58:43 test224 4.8 sec 73: 7011 of 19830 64.6% 15.36/sec -08-Apr 15:58:43 test223 0.0 sec 2: 7009 of 19830 64.7% 68.31/sec -08-Apr 15:58:43 test222 0.1 sec 11: 6998 of 19830 64.7% 143.80/sec -08-Apr 15:58:43 test221 0.0 sec 2: 6996 of 19830 64.7% 236.49/sec -08-Apr 15:58:43 test220 0.0 sec 5: 6991 of 19830 64.7% 152.66/sec -08-Apr 15:58:43 test219 0.0 sec 4: 6987 of 19830 64.8% 552.79/sec -08-Apr 15:58:43 test217 0.0 sec 4: 6983 of 19830 64.8% 306.09/sec -08-Apr 15:58:43 test216 0.1 sec 11: 6972 of 19830 64.8% 133.54/sec -08-Apr 15:58:45 test215 2.5 sec 1: 6971 of 19830 64.8% 0.39/sec -08-Apr 15:58:45 test214 0.0 sec 1: 6970 of 19830 64.9% 139.14/sec -08-Apr 15:58:45 test213 0.0 sec 5: 6965 of 19830 64.9% 778.33/sec -08-Apr 15:58:45 test212 0.1 sec 4: 6961 of 19830 64.9% 47.81/sec -08-Apr 15:58:45 test211 0.0 sec 12: 6949 of 19830 65.0% 581.87/sec -08-Apr 15:58:45 test210 0.0 sec 2: 6947 of 19830 65.0% 689.89/sec -08-Apr 15:58:51 test209 5.3 sec 24: 6923 of 19830 65.1% 4.55/sec -08-Apr 15:58:51 test208 0.0 sec 5: 6918 of 19830 65.1% 246.62/sec -08-Apr 15:58:51 test207 0.1 sec 8: 6910 of 19830 65.2% 108.95/sec -08-Apr 15:58:54 test206 2.8 sec 12: 6898 of 19830 65.2% 4.28/sec -08-Apr 15:58:54 test204 0.1 sec 10: 6888 of 19830 65.3% 74.71/sec -08-Apr 15:58:54 test203 0.0 sec 5: 6883 of 19830 65.3% 1156.34/sec -08-Apr 15:58:54 test202 0.0 sec 8: 6875 of 19830 65.3% 918.91/sec -08-Apr 15:58:54 test201 0.0 sec 7: 6868 of 19830 65.4% 967.65/sec -08-Apr 15:58:57 test200 3.1 sec 7: 6861 of 19830 65.4% 2.29/sec -08-Apr 15:58:57 test199 0.0 sec 1: 6860 of 19830 65.4% 197.94/sec -08-Apr 15:58:57 test198 0.1 sec 4: 6856 of 19830 65.4% 56.70/sec -08-Apr 15:58:58 test197 0.7 sec 1: 6855 of 19830 65.4% 1.40/sec -08-Apr 15:59:00 test196 2.3 sec 15: 6840 of 19830 65.5% 6.50/sec -08-Apr 16:01:20 test195 139.9 sec 79: 6761 of 19830 65.9% 0.56/sec -08-Apr 16:02:24 test194 64.5 sec 124: 6637 of 19830 66.5% 1.92/sec -08-Apr 16:02:43 test193 18.2 sec 6: 6631 of 19830 66.6% 0.33/sec -08-Apr 16:02:51 test189 8.1 sec 10: 6621 of 19830 66.6% 1.24/sec -08-Apr 16:02:51 test183 0.0 sec 4: 6617 of 19830 66.6% 154.36/sec -08-Apr 16:02:52 test182 0.9 sec 9: 6608 of 19830 66.7% 10.15/sec -08-Apr 16:02:52 test179 0.1 sec 18: 6590 of 19830 66.8% 309.06/sec -08-Apr 16:02:52 test165 0.0 sec 3: 6587 of 19830 66.8% 516.80/sec -08-Apr 16:02:53 test01 0.9 sec 751: 5836 of 19830 70.6% 827.13/sec -08-Apr 16:02:53 test83 0.0 sec 1: 5835 of 19830 70.6% 277.78/sec -08-Apr 16:02:53 test176 0.2 sec 7: 5828 of 19830 70.6% 40.04/sec -08-Apr 16:02:53 test174 0.0 sec 9: 5819 of 19830 70.7% 320.91/sec -08-Apr 16:02:53 test170 0.1 sec 1: 5818 of 19830 70.7% 13.87/sec -08-Apr 
16:02:54 test152 0.8 sec 405: 5413 of 19830 72.7% 536.46/sec -08-Apr 16:02:54 test155 0.1 sec 13: 5400 of 19830 72.8% 99.85/sec -08-Apr 16:02:55 test156 0.7 sec 2: 5398 of 19830 72.8% 2.76/sec -08-Apr 16:02:55 test136 0.0 sec 21: 5377 of 19830 72.9% 466.47/sec -08-Apr 16:02:55 test02 0.2 sec 133: 5244 of 19830 73.6% 618.52/sec -08-Apr 16:02:55 test109 0.1 sec 2: 5242 of 19830 73.6% 25.43/sec -08-Apr 16:02:55 test109 0.0 sec 1: 5241 of 19830 73.6% 401.93/sec -08-Apr 16:02:55 test04 0.0 sec 8: 5233 of 19830 73.6% 304.23/sec -08-Apr 16:03:03 test142 7.8 sec 627: 4606 of 19830 76.8% 80.20/sec -08-Apr 16:03:03 test162 0.1 sec 1: 4605 of 19830 76.8% 18.55/sec -08-Apr 16:03:03 test161 0.1 sec 1: 4604 of 19830 76.8% 9.92/sec -08-Apr 16:03:04 test159 1.4 sec 23: 4581 of 19830 76.9% 16.08/sec -08-Apr 16:03:05 test137 0.2 sec 10: 4571 of 19830 76.9% 59.44/sec -08-Apr 16:03:05 test139 0.4 sec 2: 4569 of 19830 77.0% 4.55/sec -08-Apr 16:03:05 test09 0.0 sec 1: 4568 of 19830 77.0% 86.72/sec -08-Apr 16:03:05 test132 0.0 sec 1: 4567 of 19830 77.0% 49.89/sec -08-Apr 16:03:09 test141 3.9 sec 110: 4457 of 19830 77.5% 27.98/sec -08-Apr 16:03:10 test144 0.6 sec 1: 4456 of 19830 77.5% 1.78/sec -08-Apr 16:03:10 test145 0.2 sec 5: 4451 of 19830 77.6% 30.90/sec -08-Apr 16:03:10 test92 0.1 sec 4: 4447 of 19830 77.6% 55.88/sec -08-Apr 16:03:10 test108 0.3 sec 2: 4445 of 19830 77.6% 7.18/sec -08-Apr 16:03:10 test172 0.1 sec 3: 4442 of 19830 77.6% 38.83/sec -08-Apr 16:03:11 test148 0.4 sec 7: 4435 of 19830 77.6% 16.78/sec -08-Apr 16:03:11 testc2(1) 0.3 sec 6: 4429 of 19830 77.7% 17.21/sec -08-Apr 16:03:12 test173 1.5 sec 11: 4418 of 19830 77.7% 7.34/sec -08-Apr 16:03:13 test157 0.7 sec 13: 4405 of 19830 77.8% 19.85/sec -08-Apr 16:03:20 test29 6.4 sec 3: 4402 of 19830 77.8% 0.47/sec -08-Apr 16:03:20 test128 0.3 sec 15: 4387 of 19830 77.9% 56.89/sec -08-Apr 16:03:50 test125 30.6 sec 639: 3748 of 19830 81.1% 20.90/sec -08-Apr 16:03:51 test82 0.1 sec 5: 3743 of 19830 81.1% 59.26/sec -08-Apr 16:04:08 test158 17.0 sec 19: 3724 of 19830 81.2% 1.12/sec -08-Apr 16:04:10 test84 2.3 sec 19: 3705 of 19830 81.3% 8.24/sec -08-Apr 16:04:11 test130 1.3 sec 18: 3687 of 19830 81.4% 14.08/sec -08-Apr 16:04:21 test19b 9.9 sec 44: 3643 of 19830 81.6% 4.43/sec -08-Apr 16:04:26 test19b 5.4 sec 5: 3638 of 19830 81.7% 0.93/sec -08-Apr 16:04:27 test133 0.5 sec 2: 3636 of 19830 81.7% 4.32/sec -08-Apr 16:04:30 test80 3.5 sec 2: 3634 of 19830 81.7% 0.57/sec -08-Apr 16:04:52 test151 21.7 sec 74: 3560 of 19830 82.0% 3.41/sec -08-Apr 16:04:52 test124 0.2 sec 3: 3557 of 19830 82.1% 14.50/sec -08-Apr 16:05:08 test23 15.3 sec 88: 3469 of 19830 82.5% 5.76/sec -08-Apr 16:05:18 test175 9.9 sec 1: 3468 of 19830 82.5% 0.10/sec -08-Apr 16:06:27 test160 69.7 sec 16: 3452 of 19830 82.6% 0.23/sec -08-Apr 16:07:37 test160 69.6 sec 3: 3449 of 19830 82.6% 0.04/sec -08-Apr 16:07:49 test54 11.9 sec 20: 3429 of 19830 82.7% 1.68/sec -08-Apr 16:08:06 test104 17.3 sec 38: 3391 of 19830 82.9% 2.19/sec -08-Apr 16:08:10 test11 3.4 sec 3: 3388 of 19830 82.9% 0.88/sec -08-Apr 16:08:11 test129 1.4 sec 1: 3387 of 19830 82.9% 0.73/sec -08-Apr 16:08:11 test138 0.1 sec 1: 3386 of 19830 82.9% 9.25/sec -08-Apr 16:13:21 test127 310.1 sec 1613: 1773 of 19830 91.1% 5.20/sec -08-Apr 16:13:36 test76 15.3 sec 15: 1758 of 19830 91.1% 0.98/sec -08-Apr 16:13:38 test107 1.6 sec 3: 1755 of 19830 91.1% 1.91/sec -08-Apr 16:13:44 test69 6.0 sec 2: 1753 of 19830 91.2% 0.33/sec -08-Apr 16:13:46 test135 1.9 sec 4: 1749 of 19830 91.2% 2.08/sec -08-Apr 16:14:19 test17 33.4 sec 29: 1720 of 19830 91.3% 
0.87/sec -08-Apr 16:14:48 test53 28.8 sec 4: 1716 of 19830 91.3% 0.14/sec -08-Apr 16:18:13 test19 204.4 sec 11: 1705 of 19830 91.4% 0.05/sec -[malloc debugging turned off] -08-Apr 16:30:34 test10 741.4 sec 784: 921 of 19830 95.4% 1.06/sec -08-Apr 16:38:22 test75b 467.9 sec 870: 51 of 19830 99.7% 1.86/sec -08-Apr 16:41:19 test16 177.5 sec 8: 43 of 19830 99.8% 0.05/sec -08-Apr 16:42:59 test81 99.3 sec 6: 37 of 19830 99.8% 0.06/sec -08-Apr 16:44:23 test21b 84.7 sec 21: 16 of 19830 99.9% 0.25/sec -08-Apr 16:50:29 test18 365.6 sec 16: all 19830 full 100% 0.04/sec -[malloc debugging turned back on] diff --git a/GraphBLAS/Tcov/log_Aug8.txt b/GraphBLAS/Tcov/log_Aug8.txt new file mode 100644 index 000000000..5ae3cc38f --- /dev/null +++ b/GraphBLAS/Tcov/log_Aug8.txt @@ -0,0 +1,140 @@ + +---------------------------------------------- [malloc] [cover] +08-Aug 18:46:15 test01 1.0 sec 2125: 17740 of 19865 10.7% 2092.91/s +08-Aug 18:46:15 test199 0.3 sec 19: 17721 of 19865 10.8% 74.90/s +08-Aug 18:46:15 test83 0.1 sec 15: 17706 of 19865 10.9% 178.01/s +08-Aug 18:46:15 test210 0.0 sec 7: 17699 of 19865 10.9% 1056.13/s +08-Aug 18:46:15 test165 0.1 sec 16: 17683 of 19865 11.0% 109.72/s +08-Aug 18:46:15 test219 0.1 sec 9: 17674 of 19865 11.0% 90.65/s +08-Aug 18:46:16 test241 0.4 sec 52: 17622 of 19865 11.3% 144.33/s +08-Aug 18:46:16 test240 0.7 sec 41: 17581 of 19865 11.5% 58.52/s +08-Aug 18:46:17 test220 0.1 sec 65: 17516 of 19865 11.8% 442.17/s +08-Aug 18:46:17 test211 0.1 sec 29: 17487 of 19865 12.0% 507.13/s +08-Aug 18:46:17 test202 0.2 sec 33: 17454 of 19865 12.1% 203.06/s +08-Aug 18:46:18 test152 1.3 sec 864: 16590 of 19865 16.5% 688.95/s +08-Aug 18:46:18 test222 0.1 sec 6: 16584 of 19865 16.5% 68.18/s +08-Aug 18:46:19 test186 0.5 sec 71: 16513 of 19865 16.9% 150.75/s +08-Aug 18:46:19 test150 0.1 sec 75: 16438 of 19865 17.3% 574.18/s +08-Aug 18:46:19 test239 0.1 sec 14: 16424 of 19865 17.3% 109.53/s +08-Aug 18:46:19 test235 0.0 sec 12: 16412 of 19865 17.4% 296.33/s +08-Aug 18:46:19 test226 0.0 sec 8: 16404 of 19865 17.4% 596.88/s +08-Aug 18:46:19 test223 0.0 sec 4: 16400 of 19865 17.4% 122.89/s +08-Aug 18:46:19 test204 0.2 sec 22: 16378 of 19865 17.6% 136.22/s +08-Aug 18:46:19 test203 0.1 sec 11: 16367 of 19865 17.6% 127.49/s +08-Aug 18:46:19 test183 0.0 sec 12: 16355 of 19865 17.7% 290.78/s +08-Aug 18:46:19 test179 0.2 sec 26: 16329 of 19865 17.8% 167.10/s +08-Aug 18:46:19 test174 0.1 sec 10: 16319 of 19865 17.9% 167.80/s +08-Aug 18:46:20 test155 0.1 sec 46: 16273 of 19865 18.1% 311.92/s +08-Aug 18:46:20 test156 0.6 sec 258: 16015 of 19865 19.4% 401.47/s +08-Aug 18:46:20 test136 0.2 sec 58: 15957 of 19865 19.7% 274.79/s +08-Aug 18:46:21 test02 0.4 sec 185: 15772 of 19865 20.6% 493.95/s +08-Aug 18:46:21 test109 0.1 sec 10: 15762 of 19865 20.7% 90.31/s +08-Aug 18:46:21 test109 0.0 sec 1: 15761 of 19865 20.7% 390.32/s +08-Aug 18:46:21 test04 0.1 sec 21: 15740 of 19865 20.8% 173.40/s +08-Aug 18:46:21 test207 0.0 sec 1: 15739 of 19865 20.8% 184.03/s +08-Aug 18:46:21 test221 0.0 sec 2: 15737 of 19865 20.8% 228.41/s +08-Aug 18:46:21 test162 0.1 sec 8: 15729 of 19865 20.8% 111.39/s +08-Aug 18:46:22 test159 1.1 sec 148: 15581 of 19865 21.6% 135.39/s +08-Aug 18:46:22 test09 0.0 sec 5: 15576 of 19865 21.6% 468.69/s +08-Aug 18:46:22 test132 0.1 sec 10: 15566 of 19865 21.6% 96.99/s +08-Aug 18:46:26 test141 3.7 sec 543: 15023 of 19865 24.4% 146.75/s +08-Aug 18:46:27 testc2(1,1) 0.4 sec 44: 14979 of 19865 24.6% 110.45/s +08-Aug 18:46:27 test214 0.0 sec 3: 14976 of 19865 24.6% 354.90/s +08-Aug 18:46:27 test213 0.0 
sec 4: 14972 of 19865 24.6% 366.30/s +08-Aug 18:46:29 test206 2.4 sec 265: 14707 of 19865 26.0% 112.59/s +08-Aug 18:46:29 test212 0.1 sec 10: 14697 of 19865 26.0% 103.02/s +08-Aug 18:46:29 test128 0.3 sec 54: 14643 of 19865 26.3% 215.58/s +08-Aug 18:46:29 test82 0.1 sec 15: 14628 of 19865 26.4% 180.12/s +08-Aug 18:46:30 test229 0.6 sec 7: 14621 of 19865 26.4% 10.84/s +08-Aug 18:46:31 test144 0.6 sec 6: 14615 of 19865 26.4% 10.09/s +08-Aug 18:46:43 test14 12.0 sec 620: 13995 of 19865 29.5% 51.57/s +08-Aug 18:46:51 test180 7.9 sec 130: 13865 of 19865 30.2% 16.44/s +08-Aug 18:46:54 test236 3.4 sec 73: 13792 of 19865 30.6% 21.34/s +08-Aug 18:46:57 test232 2.5 sec 22: 13770 of 19865 30.7% 8.65/s +08-Aug 18:47:13 test228 16.0 sec 36: 13734 of 19865 30.9% 2.25/s +08-Aug 18:49:00 test154 107.1 sec 1504: 12230 of 19865 38.4% 14.05/s +08-Aug 18:49:12 test238 12.8 sec 73: 12157 of 19865 38.8% 5.71/s +08-Aug 18:49:39 test151b 26.4 sec 143: 12014 of 19865 39.5% 5.41/s +08-Aug 18:49:42 test184 3.1 sec 43: 11971 of 19865 39.7% 13.98/s +08-Aug 18:49:54 test191 12.3 sec 20: 11951 of 19865 39.8% 1.63/s +08-Aug 18:50:42 test188 47.4 sec 194: 11757 of 19865 40.8% 4.09/s +08-Aug 18:50:42 test237 0.8 sec 16: 11741 of 19865 40.9% 20.81/s +08-Aug 18:50:43 test240 0.2 sec 1: 11740 of 19865 40.9% 4.61/s +08-Aug 18:50:47 test224 4.7 sec 23: 11717 of 19865 41.0% 4.87/s +08-Aug 18:50:50 test196 2.5 sec 11: 11706 of 19865 41.1% 4.46/s +08-Aug 18:50:56 test209 5.8 sec 20: 11686 of 19865 41.2% 3.43/s +08-Aug 18:51:13 test104 16.8 sec 40: 11646 of 19865 41.4% 2.38/s +08-Aug 18:51:20 test189 7.9 sec 6: 11640 of 19865 41.4% 0.76/s +08-Aug 18:52:24 test194 63.1 sec 8: 11632 of 19865 41.4% 0.13/s +08-Aug 18:52:38 test76 14.1 sec 19: 11613 of 19865 41.5% 1.34/s +08-Aug 18:53:12 test244 34.1 sec 17: 11596 of 19865 41.6% 0.50/s +[malloc debugging turned off] +08-Aug 18:53:12 test201 0.0 sec 5: 11591 of 19865 41.7% 678.06/s +08-Aug 18:53:12 test225 0.2 sec 5: 11586 of 19865 41.7% 25.56/s +08-Aug 18:53:12 test170 0.1 sec 1: 11585 of 19865 41.7% 11.10/s +08-Aug 18:53:12 test176 0.1 sec 5: 11580 of 19865 41.7% 68.32/s +08-Aug 18:53:12 test208 0.0 sec 5: 11575 of 19865 41.7% 273.63/s +08-Aug 18:53:12 test216 0.1 sec 3: 11572 of 19865 41.7% 32.19/s +08-Aug 18:53:19 test142 6.4 sec 630: 10942 of 19865 44.9% 98.74/s +08-Aug 18:53:19 test137 0.2 sec 5: 10937 of 19865 44.9% 25.80/s +08-Aug 18:53:19 test139 0.4 sec 2: 10935 of 19865 45.0% 4.58/s +08-Aug 18:53:20 test145 0.2 sec 8: 10927 of 19865 45.0% 41.20/s +08-Aug 18:53:20 test172 0.1 sec 3: 10924 of 19865 45.0% 31.63/s +08-Aug 18:53:20 test148 0.5 sec 6: 10918 of 19865 45.0% 12.18/s +08-Aug 18:53:20 test186 0.3 sec 8: 10910 of 19865 45.1% 31.31/s +08-Aug 18:53:21 test157 0.7 sec 13: 10897 of 19865 45.1% 19.27/s +08-Aug 18:53:22 test182 1.1 sec 9: 10888 of 19865 45.2% 8.42/s +08-Aug 18:53:23 test108 0.3 sec 2: 10886 of 19865 45.2% 5.96/s +08-Aug 18:53:23 test130 0.8 sec 25: 10861 of 19865 45.3% 30.66/s +08-Aug 18:53:24 test124 0.2 sec 3: 10858 of 19865 45.3% 15.03/s +08-Aug 18:53:24 test138 0.1 sec 1: 10857 of 19865 45.3% 12.08/s +08-Aug 18:53:28 test227 4.3 sec 27: 10830 of 19865 45.5% 6.32/s +08-Aug 18:53:57 test125 29.1 sec 639: 10191 of 19865 48.7% 21.98/s +08-Aug 18:54:56 test234 58.9 sec 69: 10122 of 19865 49.0% 1.17/s +08-Aug 18:55:42 test242 45.9 sec 77: 10045 of 19865 49.4% 1.68/s +08-Aug 18:55:43 test173 1.0 sec 11: 10034 of 19865 49.5% 10.68/s +08-Aug 18:55:46 test200 3.3 sec 10: 10024 of 19865 49.5% 3.07/s +08-Aug 18:55:46 test197 0.2 sec 1: 10023 of 19865 49.5% 6.09/s 
+08-Aug 18:55:56 test158 9.8 sec 19: 10004 of 19865 49.6% 1.94/s +08-Aug 18:55:57 test84 1.2 sec 18: 9986 of 19865 49.7% 14.80/s +08-Aug 18:56:07 test19b 9.6 sec 48: 9938 of 19865 50.0% 5.02/s +08-Aug 18:56:12 test19b 5.0 sec 5: 9933 of 19865 50.0% 1.00/s +08-Aug 18:56:12 test133 0.4 sec 2: 9931 of 19865 50.0% 5.17/s +08-Aug 18:56:16 test80 3.9 sec 28: 9903 of 19865 50.1% 7.21/s +08-Aug 18:56:33 test151 16.5 sec 73: 9830 of 19865 50.5% 4.41/s +08-Aug 18:56:49 test23 15.8 sec 88: 9742 of 19865 51.0% 5.57/s +08-Aug 18:56:50 test135 1.9 sec 7: 9735 of 19865 51.0% 3.71/s +08-Aug 18:57:21 test160 30.4 sec 36: 9699 of 19865 51.2% 1.18/s +08-Aug 18:57:26 test54 4.6 sec 20: 9679 of 19865 51.3% 4.31/s +08-Aug 18:57:27 test129 1.1 sec 7: 9672 of 19865 51.3% 6.50/s +08-Aug 18:57:29 test69 1.9 sec 3: 9669 of 19865 51.3% 1.62/s +08-Aug 18:58:51 test230 82.1 sec 104: 9565 of 19865 51.8% 1.27/s +08-Aug 19:08:15 test74 564.0 sec 5796: 3769 of 19865 81.0% 10.28/s +08-Aug 19:11:20 test127 185.2 sec 1613: 2156 of 19865 89.1% 8.71/s +08-Aug 19:11:23 test19 3.0 sec 12: 2144 of 19865 89.2% 4.00/s +08-Aug 19:11:26 test11 3.3 sec 3: 2141 of 19865 89.2% 0.92/s +08-Aug 19:11:57 test160 30.4 sec 4: 2137 of 19865 89.2% 0.13/s +08-Aug 19:11:59 test215 2.8 sec 1: 2136 of 19865 89.2% 0.35/s +08-Aug 19:12:14 test193 15.0 sec 5: 2131 of 19865 89.3% 0.33/s +08-Aug 19:13:36 test195 81.3 sec 42: 2089 of 19865 89.5% 0.52/s +08-Aug 19:13:43 test233 7.6 sec 1: 2088 of 19865 89.5% 0.13/s +08-Aug 19:14:06 test243 22.9 sec 7: 2081 of 19865 89.5% 0.31/s +08-Aug 19:14:12 test29 5.8 sec 2: 2079 of 19865 89.5% 0.34/s +08-Aug 19:15:03 testca(1) 50.9 sec 6: 2073 of 19865 89.6% 0.12/s +08-Aug 19:15:10 test187 7.2 sec 3: 2070 of 19865 89.6% 0.42/s +08-Aug 19:15:15 test192 4.9 sec 1: 2069 of 19865 89.6% 0.20/s +08-Aug 19:15:32 test181 16.9 sec 13: 2056 of 19865 89.7% 0.77/s +08-Aug 19:17:09 test185 96.8 sec 14: 2042 of 19865 89.7% 0.14/s +08-Aug 19:17:22 test53 13.4 sec 4: 2038 of 19865 89.7% 0.30/s +08-Aug 19:18:05 test17 42.6 sec 29: 2009 of 19865 89.9% 0.68/s +08-Aug 19:24:33 test231 388.4 sec 295: 1714 of 19865 91.4% 0.76/s +08-Aug 19:35:08 test10 634.7 sec 793: 921 of 19865 95.4% 1.25/s +08-Aug 19:44:11 test75b 543.1 sec 870: 51 of 19865 99.7% 1.60/s +08-Aug 19:44:12 testc2(0,0) 1.0 sec 3: 48 of 19865 99.8% 3.14/s +08-Aug 19:44:12 testc4(0) 0.4 sec 2: 46 of 19865 99.8% 5.66/s +08-Aug 19:44:26 testc7(0) 13.1 sec 2: 44 of 19865 99.8% 0.15/s +08-Aug 19:44:30 testcc(1) 4.5 sec 1: 43 of 19865 99.8% 0.22/s +08-Aug 19:46:09 test81 98.6 sec 6: 37 of 19865 99.8% 0.06/s +08-Aug 19:47:26 test21b 77.4 sec 21: 16 of 19865 99.9% 0.27/s +08-Aug 19:52:56 test18 330.1 sec 16: all 19865 full 100% 0.05/s +[malloc debugging turned back on] diff --git a/GraphBLAS/Test/GB_mex_Matrix_isStoredElement.c b/GraphBLAS/Test/GB_mex_Matrix_isStoredElement.c new file mode 100644 index 000000000..0f4fcf5d9 --- /dev/null +++ b/GraphBLAS/Test/GB_mex_Matrix_isStoredElement.c @@ -0,0 +1,98 @@ +//------------------------------------------------------------------------------ +// GB_mex_Matrix_isStoredElement: interface for x = A(i,j) +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// x = A (i,j), where i and j are zero-based. If i and j arrays, then +// x (k) = A (i (k), j (k)) is done for all k. 
+ +// I and J are zero-based + +#include "GB_mex.h" + +#define USAGE "x = GB_mex_Matrix_isStoredElement (A, I, J)" + +#define FREE_ALL \ +{ \ + GrB_Matrix_free_(&A) ; \ + GB_mx_put_global (true) ; \ +} + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + bool malloc_debug = GB_mx_get_global (true) ; + GrB_Matrix A = NULL ; + GrB_Index *I = NULL, ni = 0, I_range [3] ; + GrB_Index *J = NULL, nj = 0, J_range [3] ; + bool is_list ; + + // check inputs + if (nargout > 1 || nargin < 3 || nargin > 5) + { + mexErrMsgTxt ("Usage: " USAGE) ; + } + + #define GET_DEEP_COPY ; + #define FREE_DEEP_COPY ; + + // get A (shallow copy) + A = GB_mx_mxArray_to_Matrix (pargin [0], "A input", false, true) ; + if (A == NULL) + { + FREE_ALL ; + mexErrMsgTxt ("A failed") ; + } + + // get I + if (!GB_mx_mxArray_to_indices (&I, pargin [1], &ni, I_range, &is_list)) + { + FREE_ALL ; + mexErrMsgTxt ("I failed") ; + } + if (!is_list) + { + mexErrMsgTxt ("I is invalid; must be a list") ; + } + + // get J + if (!GB_mx_mxArray_to_indices (&J, pargin [2], &nj, J_range, &is_list)) + { + FREE_ALL ; + mexErrMsgTxt ("J failed") ; + } + if (!is_list) + { + mexErrMsgTxt ("J is invalid; must be a list") ; + } + + if (ni != nj) + { + FREE_ALL ; + mexErrMsgTxt ("I and J must be the same size") ; + } + + // create output X + pargout [0] = GB_mx_create_full (ni, 1, GrB_BOOL) ; + bool *X = (bool *) mxGetData (pargout [0]) ; + + // X (k) = true if A (I (k), J (k)) is a stored entry + for (int64_t k = 0 ; k < ni ; k++) + { + GrB_Info info = GxB_Matrix_isStoredElement (A, I [k], J [k]) ; + X [k] = (info == GrB_SUCCESS) ; + } + + FREE_ALL ; +} + diff --git a/GraphBLAS/Test/GB_mex_Vector_isStoredElement.c b/GraphBLAS/Test/GB_mex_Vector_isStoredElement.c new file mode 100644 index 000000000..1fcd0945d --- /dev/null +++ b/GraphBLAS/Test/GB_mex_Vector_isStoredElement.c @@ -0,0 +1,76 @@ +//------------------------------------------------------------------------------ +// GB_mex_Vector_isStoredElement: interface for x = v(i) +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB_mex.h" + +#define USAGE "y = GB_mex_Vector_isStoredElement (v, I)" + +#define FREE_ALL \ +{ \ + GrB_Vector_free_(&v) ; \ + GB_mx_put_global (true) ; \ +} + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + bool malloc_debug = GB_mx_get_global (true) ; + GrB_Vector v = NULL ; + bool *X = NULL ; + GrB_Index *I = NULL, ni = 0, I_range [3] ; + bool is_list ; + + // check inputs + if (nargout > 1 || nargin < 2 || nargin > 4) + { + mexErrMsgTxt ("Usage: " USAGE) ; + } + + #define GET_DEEP_COPY ; + #define FREE_DEEP_COPY ; + + // get v (shallow copy) + v = GB_mx_mxArray_to_Vector (pargin [0], "v input", false, true) ; + if (v == NULL) + { + FREE_ALL ; + mexErrMsgTxt ("v failed") ; + } + + // get I + if (!GB_mx_mxArray_to_indices (&I, pargin [1], &ni, I_range, &is_list)) + { + FREE_ALL ; + mexErrMsgTxt ("I failed") ; + } + if (!is_list) + { + mexErrMsgTxt ("I must be a list") ; + } + + // create output X + pargout [0] = GB_mx_create_full (ni, 1, GrB_BOOL) ; + X = (bool *) mxGetData (pargout [0]) ; + + // x = v (i) + for (int64_t k = 0 ; k < ni ; k++) + { + GrB_Info info = GxB_Vector_isStoredElement(v, I [k]) ; + X [k] = (info == GrB_SUCCESS) ; + } + + FREE_ALL ; +} + diff --git a/GraphBLAS/Test/GB_mex_about7.c b/GraphBLAS/Test/GB_mex_about7.c index 6a7ef0db9..6c584217b 100644 --- a/GraphBLAS/Test/GB_mex_about7.c +++ b/GraphBLAS/Test/GB_mex_about7.c @@ -77,7 +77,7 @@ void mexFunction char *compiler ; int compiler_version [3] ; OK (GxB_Global_Option_get (GxB_COMPILER_NAME, &compiler)) ; - OK (GxB_Global_Option_get (GxB_COMPILER_VERSION, &compiler_version)) ; + OK (GxB_Global_Option_get (GxB_COMPILER_VERSION, compiler_version)) ; printf ("GraphBLAS compiled with:\n[%s] [v%d.%d.%d]\n", compiler, compiler_version [0], compiler_version [1], compiler_version [2]) ; diff --git a/GraphBLAS/Test/GB_mex_about8.c b/GraphBLAS/Test/GB_mex_about8.c index 7411e69d9..356c2277b 100644 --- a/GraphBLAS/Test/GB_mex_about8.c +++ b/GraphBLAS/Test/GB_mex_about8.c @@ -27,7 +27,7 @@ void mexFunction { GrB_Info info ; - GrB_Matrix A = NULL ; + GrB_Matrix A = NULL, C = NULL ; //-------------------------------------------------------------------------- // startup GraphBLAS @@ -63,6 +63,22 @@ void mexFunction GrB_free (&v_0) ; GrB_free (&v_1) ; + //-------------------------------------------------------------------------- + // reshape error handling + //-------------------------------------------------------------------------- + + GrB_Index n = (1L << 40) ; + OK (GrB_Matrix_new (&C, GrB_BOOL, n, n)) ; + expected = GrB_OUT_OF_MEMORY ; + ERR (GxB_Matrix_reshape (C, true, n/2, 2*n, NULL)) ; + OK (GrB_Matrix_free (&C)) ; + + n = 12 ; + OK (GrB_Matrix_new (&C, GrB_BOOL, n, n)) ; + expected = GrB_DIMENSION_MISMATCH ; + ERR (GxB_Matrix_reshape (C, true, n, 2*n, NULL)) ; + OK (GrB_Matrix_free (&C)) ; + //-------------------------------------------------------------------------- // wrapup //-------------------------------------------------------------------------- diff --git a/GraphBLAS/Test/GB_mex_about9.c b/GraphBLAS/Test/GB_mex_about9.c new file mode 100644 index 000000000..d244f5037 --- /dev/null +++ b/GraphBLAS/Test/GB_mex_about9.c @@ -0,0 +1,61 @@ +//------------------------------------------------------------------------------ +// GB_mex_about9: still more basic tests (not for Windows) 
+//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// Windows is limited to user-defined types of size 128 or less. + +#include "GB_mex.h" +#include "GB_mex_errors.h" + +#define USAGE "GB_mex_about9" +#define FREE_ALL ; +#define GET_DEEP_COPY ; +#define FREE_DEEP_COPY ; + +typedef struct +{ + double stuff [32] ; +} +bigtype ; + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + GrB_Info info ; + + //-------------------------------------------------------------------------- + // startup GraphBLAS + //-------------------------------------------------------------------------- + + bool malloc_debug = GB_mx_get_global (true) ; + + //-------------------------------------------------------------------------- + // user-defined type of 256 bytes + //-------------------------------------------------------------------------- + + GrB_Type BigType ; + OK (GxB_Type_new (&BigType, sizeof (bigtype), "bigtype", + "typedef struct { double stuff [32] ; } bigtype")) ; + OK (GxB_Type_fprint (BigType, "(256-byte big type)", GxB_COMPLETE, + stdout)) ; + OK (GrB_Type_free (&BigType)) ; + + //-------------------------------------------------------------------------- + // wrapup + //-------------------------------------------------------------------------- + + GB_mx_put_global (true) ; + printf ("\nGB_mex_about9: all tests passed\n\n") ; +} + diff --git a/GraphBLAS/Test/GB_mex_errors.c b/GraphBLAS/Test/GB_mex_errors.c index cc647cfd3..2dcfdb508 100644 --- a/GraphBLAS/Test/GB_mex_errors.c +++ b/GraphBLAS/Test/GB_mex_errors.c @@ -1227,13 +1227,12 @@ void mexFunction ERR (GrB_Vector_extractElement_FP64_(&x_double, v, -1)) ; ERR (GrB_Vector_extractElement_FP64_(&x_double, v, 10)) ; - expected = GrB_DOMAIN_MISMATCH ; - - ERR (GrB_Vector_extractElement_UDT ((void *) X, v, 0)) ; - OK (GrB_Vector_setElement_FP64 (v, 22.8, 2)) ; OK (GrB_Vector_setElement_FP64 (v, 44.9, 4)) ; + expected = GrB_DOMAIN_MISMATCH ; + ERR (GrB_Vector_extractElement_UDT ((void *) X, v, 2)) ; + x_double = 404 ; OK (GrB_Vector_extractElement_FP64_(&x_double, v, 3)) ; CHECK (x_double == 404) ; @@ -1666,18 +1665,24 @@ void mexFunction ERR (GrB_Matrix_extractElement_FP64 (NULL, Acrud, 0, 0)) ; ERR (GrB_Matrix_extractElement_UDT (NULL, Acrud, 0, 0)) ; - expected = GrB_INVALID_INDEX ; + OK (GrB_Matrix_setElement_FP64 (A, 22.8, 2, 0)) ; + OK (GrB_Matrix_setElement_FP64 (A, 44.9, 4, 0)) ; + expected = GrB_INVALID_INDEX ; + OK (GxB_Matrix_Option_set (A, GxB_FORMAT, GxB_BY_ROW)) ; + GxB_print (A, 3) ; + ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, -1, 0)) ; + ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, 10, 0)) ; + ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, 0, 911)) ; + OK (GxB_Matrix_Option_set (A, GxB_FORMAT, GxB_BY_COL)) ; + GxB_print (A, 3) ; ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, -1, 0)) ; ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, 10, 0)) ; ERR (GrB_Matrix_extractElement_FP64_(&x_double, A, 0, 911)) ; + OK (GxB_Matrix_Option_set (A, GxB_FORMAT, GxB_BY_ROW)) ; expected = GrB_DOMAIN_MISMATCH ; - - ERR (GrB_Matrix_extractElement_UDT ((void *) X, A, 0, 0)) ; - - OK (GrB_Matrix_setElement_FP64 (A, 22.8, 2, 0)) ; - OK (GrB_Matrix_setElement_FP64 (A, 44.9, 4, 0)) ; + ERR (GrB_Matrix_extractElement_UDT ((void *) X, A, 2, 0)) 
; x_double = 404 ; OK (GrB_Matrix_extractElement_FP64_(&x_double, A, 3, 0)) ; diff --git a/GraphBLAS/Test/GB_mex_reshape.c b/GraphBLAS/Test/GB_mex_reshape.c new file mode 100644 index 000000000..51fb29097 --- /dev/null +++ b/GraphBLAS/Test/GB_mex_reshape.c @@ -0,0 +1,80 @@ +//------------------------------------------------------------------------------ +// GB_mex_reshape: reshape a matrix +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB_mex.h" + +#define USAGE "C = GB_mex_reshape (A, nrows_new, ncols_new, by_col, in_place)" + +#define FREE_ALL \ +{ \ + GrB_Matrix_free_(&A) ; \ + GrB_Matrix_free_(&C) ; \ + GB_mx_put_global (true) ; \ +} + +void mexFunction +( + int nargout, + mxArray *pargout [ ], + int nargin, + const mxArray *pargin [ ] +) +{ + + bool malloc_debug = GB_mx_get_global (true) ; + GrB_Matrix C = NULL, A = NULL ; + + // check inputs + if (nargout > 1 || nargin != 5) + { + mexErrMsgTxt ("Usage: " USAGE) ; + } + + #define FREE_DEEP_COPY GrB_Matrix_free_(&C) ; + + GrB_Index nrows_new = (GrB_Index) mxGetScalar (pargin [1]) ; + GrB_Index ncols_new = (GrB_Index) mxGetScalar (pargin [2]) ; + bool by_col = (bool ) mxGetScalar (pargin [3]) ; + bool in_place = (bool ) mxGetScalar (pargin [4]) ; + + // reshape the matrix + if (in_place) + { + // in-place reshape of C + #define GET_DEEP_COPY \ + C = GB_mx_mxArray_to_Matrix (pargin [0], "C input", true, true) ; + GET_DEEP_COPY ; + if (C == NULL) + { + FREE_ALL ; + mexErrMsgTxt ("C failed") ; + } + METHOD (GxB_Matrix_reshape (C, by_col, nrows_new, ncols_new, NULL)) ; + } + else + { + // C is a new matrix created from the input matrix A + #undef GET_DEEP_COPY + #define GET_DEEP_COPY ; + A = GB_mx_mxArray_to_Matrix (pargin [0], "A", false, true) ; + if (A == NULL) + { + FREE_ALL ; + mexErrMsgTxt ("A failed") ; + } + METHOD (GxB_Matrix_reshapeDup (&C, A, by_col, nrows_new, ncols_new, + NULL)) ; + } + + // return C as a struct and free the GraphBLAS C + pargout [0] = GB_mx_Matrix_to_mxArray (&C, "C output", true) ; + + FREE_ALL ; +} + diff --git a/GraphBLAS/Test/GB_mex_serialize.c b/GraphBLAS/Test/GB_mex_serialize.c index e7bfa8ed8..47bc4f27b 100644 --- a/GraphBLAS/Test/GB_mex_serialize.c +++ b/GraphBLAS/Test/GB_mex_serialize.c @@ -26,6 +26,8 @@ // ... 
// GxB_COMPRESSION_LZ4HC 2009 // LZ4HC:9 +// GxB_COMPRESSION_ZSTD 3000 // ZSTD with default level 3 + #define USAGE "C = GB_mex_serialize (A, method)" #define FREE_ALL \ diff --git a/GraphBLAS/Test/GB_mx_string_to_UnaryOp.c b/GraphBLAS/Test/GB_mx_string_to_UnaryOp.c index fbc57cf06..112fb54b6 100644 --- a/GraphBLAS/Test/GB_mx_string_to_UnaryOp.c +++ b/GraphBLAS/Test/GB_mx_string_to_UnaryOp.c @@ -142,6 +142,7 @@ bool GB_mx_string_to_UnaryOp // true if successful, false otherwise else if (MATCH (opname, "tgamma" )) { opcode = GB_TGAMMA_unop_code ; } else if (MATCH (opname, "erf" )) { opcode = GB_ERF_unop_code ; } else if (MATCH (opname, "erfc" )) { opcode = GB_ERFC_unop_code ; } + else if (MATCH (opname, "cbrt" )) { opcode = GB_CBRT_unop_code ; } else if (MATCH (opname, "frexpx" )) { opcode = GB_FREXPX_unop_code ; } else if (MATCH (opname, "frexpe" )) { opcode = GB_FREXPE_unop_code ; } @@ -750,6 +751,18 @@ bool GB_mx_string_to_UnaryOp // true if successful, false otherwise } break ; + case GB_CBRT_unop_code : // z = cbrt (x) + + switch (xcode) + { + case GB_FP32_code : op = GxB_CBRT_FP32 ; break ; + case GB_FP64_code : op = GxB_CBRT_FP64 ; break ; + default : + mexWarnMsgIdAndTxt ("GB:warn","unknown operator") ; + return (false) ; + } + break ; + case GB_FREXPX_unop_code : // z = frexpx (x), mantissa from frexp switch (xcode) diff --git a/GraphBLAS/Test/GB_spec_op.m b/GraphBLAS/Test/GB_spec_op.m index d35d5f5b5..fa7bc0449 100644 --- a/GraphBLAS/Test/GB_spec_op.m +++ b/GraphBLAS/Test/GB_spec_op.m @@ -20,7 +20,7 @@ % 'round', ('trunc' or 'fix'), 'exp2', 'expm1', 'log10', 'log2', ('lgamma' or % 'gammaln'), ('tgamma' or 'gamma'), 'erf', 'erfc', 'frexpx', 'frexpe', 'conj', % ('creal' or 'real'), ('cimag' or 'imag'), ('carg' or 'angle'), 'isinf', -% 'isnan', 'isfinite'. +% 'isnan', 'isfinite', 'cbrt'. % % op.optype: 'logical', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', % 'int64', 'uint64', 'single', 'double', 'single complex' or 'double complex'. 
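For context, the new cbrt unary operator added throughout this changeset (GxB_CBRT_FP32 and GxB_CBRT_FP64) applies entrywise like any other unary op. A minimal sketch follows (the 2-by-2 matrix and its single entry are assumptions for illustration; assumes GrB_init has been called). Unlike pow (x, 1./3.), cbrt is defined for negative inputs (cbrt of -8 is -2), which is why the MATLAB mimic below uses nthroot (x, 3):

    GrB_Matrix A, C ;
    GrB_Matrix_new (&A, GrB_FP64, 2, 2) ;
    GrB_Matrix_setElement_FP64 (A, 27.0, 0, 0) ;
    GrB_Matrix_new (&C, GrB_FP64, 2, 2) ;
    // C = cbrt (A), entrywise: C(0,0) becomes 3.0
    GrB_Matrix_apply (C, NULL, NULL, GxB_CBRT_FP64, A, NULL) ;
    GrB_Matrix_free (&A) ;
    GrB_Matrix_free (&C) ;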
@@ -357,6 +357,9 @@ case 'erfc' z = erfc (x) ; + case 'cbrt' + z = nthroot (x, 3) ; + case 'frexpx' [z,~] = log2 (x) ; diff --git a/GraphBLAS/Test/GB_spec_operator.m b/GraphBLAS/Test/GB_spec_operator.m index b29216b2b..2c04e7c1e 100644 --- a/GraphBLAS/Test/GB_spec_operator.m +++ b/GraphBLAS/Test/GB_spec_operator.m @@ -266,7 +266,7 @@ % unary ops for real floating-point only %-------------------------------------------------------------------------- - case { 'lgamma', 'tgamma', 'erf', 'erfc', 'frexpx', 'frexpe' } + case { 'lgamma', 'tgamma', 'erf', 'erfc', 'frexpx', 'frexpe', 'cbrt' } % x and z have the same type if (~is_real_float) error ('invalid op') ; diff --git a/GraphBLAS/Test/GB_spec_opsall.m b/GraphBLAS/Test/GB_spec_opsall.m index 245221834..554ed60ba 100644 --- a/GraphBLAS/Test/GB_spec_opsall.m +++ b/GraphBLAS/Test/GB_spec_opsall.m @@ -134,7 +134,7 @@ % unary ops for FP32 and FP64 only unary_ops.fpreal = { -'lgamma', 'tgamma', 'erf', 'erfc', 'frexpx', 'frexpe' } ; +'lgamma', 'tgamma', 'erf', 'erfc', 'frexpx', 'frexpe', 'cbrt' } ; % unary ops for FC32 and FC64 only unary_ops.complex = { diff --git a/GraphBLAS/Test/logstat.m b/GraphBLAS/Test/logstat.m index 91e12a3a8..ec23fc03b 100644 --- a/GraphBLAS/Test/logstat.m +++ b/GraphBLAS/Test/logstat.m @@ -77,8 +77,8 @@ function logstat (testscript, threads) % trim the year from the date s = s ([1:6 12:end]) ; - fprintf ( '%s %-10s %7.1f sec ', s, testscript, t) ; - fprintf (f, '%s %-10s %7.1f sec ', s, testscript, t) ; + fprintf ( '%s %-11s %7.1f sec ', s, testscript, t) ; + fprintf (f, '%s %-11s %7.1f sec ', s, testscript, t) ; if (~isempty (strfind (pwd, 'Tcov'))) global GraphBLAS_debug GraphBLAS_grbcov @@ -90,14 +90,14 @@ function logstat (testscript, threads) c = sum (GraphBLAS_grbcov > 0) ; n = length (GraphBLAS_grbcov) ; if (c == n) - fprintf ( '%5d: all %5d full 100%% %8.2f/sec', ... + fprintf ( '%5d: all %5d full 100%% %8.2f/s', ... c - clast, n, (c-clast) / t) ; - fprintf (f, '%5d: all %5d full 100%% %8.2f/sec', ... + fprintf (f, '%5d: all %5d full 100%% %8.2f/s', ... c - clast, n, (c-clast) / t) ; else - fprintf ( '%5d: %5d of %5d %5.1f%% %8.2f/sec', ... + fprintf ( '%5d: %5d of %5d %5.1f%% %8.2f/s', ... c - clast, n-c, n, 100 * (c/n), (c-clast) / t) ; - fprintf (f, '%5d: %5d of %5d %5.1f%% %8.2f/sec', ... + fprintf (f, '%5d: %5d of %5d %5.1f%% %8.2f/s', ... c - clast, n-c, n, 100 * (c/n), (c-clast) / t) ; end if (debug) diff --git a/GraphBLAS/Test/make.m b/GraphBLAS/Test/make.m index 2566b0cf3..e5e5f73c0 100644 --- a/GraphBLAS/Test/make.m +++ b/GraphBLAS/Test/make.m @@ -81,6 +81,7 @@ function make (what) hfiles = [ dir('*.h') ; dir('Template/*.c') ] ; inc = '-ITemplate -I../Include -I../Source -I../Source/Template -I../lz4 -I../rmm_wrap' ; +inc = [inc ' -I../zstd -I../zstd/zstd_subset'] ; if (ismac) % Mac (do 'make install' for GraphBLAS first) diff --git a/GraphBLAS/Test/test01.m b/GraphBLAS/Test/test01.m index c0e9c034e..819107ddf 100644 --- a/GraphBLAS/Test/test01.m +++ b/GraphBLAS/Test/test01.m @@ -13,6 +13,9 @@ GB_mex_about6 ; GB_mex_about7 ; GB_mex_about8 ; +if (~ispc) + GB_mex_about9 ; +end fprintf ('\ntest01: all tests passed\n') ; diff --git a/GraphBLAS/Test/test17.m b/GraphBLAS/Test/test17.m index bcc7103d7..635cab1a4 100644 --- a/GraphBLAS/Test/test17.m +++ b/GraphBLAS/Test/test17.m @@ -45,9 +45,11 @@ use_scalar = (rand (1) > 0.9) ; x1 = GB_mex_Matrix_extractElement (A, iu, ju, ... xtype, use_scalar) ; - x2 = GB_spec_Matrix_extractElement (A, i, j, xtype) ; + [x2 noval] = ... 
+ GB_spec_Matrix_extractElement (A, i, j, xtype) ; assert (isequal (x1,x2)) - + x3 = GB_mex_Matrix_isStoredElement (A, iu, ju) ; + assert (isequal (~noval,x3)) end end end @@ -56,9 +58,13 @@ for i = 0:(m*n)-1 iu = uint64 (i) ; use_scalar = (rand (1) > 0.9) ; - x1 = GB_mex_Vector_extractElement (B, iu, xtype, use_scalar) ; - x2 = GB_spec_Vector_extractElement (B, i, xtype) ; + x1 = GB_mex_Vector_extractElement (B, iu, xtype, ... + use_scalar) ; + [x2 noval] = ... + GB_spec_Vector_extractElement (B, i, xtype) ; assert (isequal (x1,x2)) + x3 = GB_mex_Vector_isStoredElement (B, iu) ; + assert (isequal (~noval,x3)) end end end diff --git a/GraphBLAS/Test/test228.m b/GraphBLAS/Test/test228.m index 3f11fa394..5859574ae 100644 --- a/GraphBLAS/Test/test228.m +++ b/GraphBLAS/Test/test228.m @@ -21,7 +21,7 @@ A.sparsity = A_sparsity ; C = GB_mex_serialize (A, -2) ; % GrB_serialize GB_spec_compare (A, C) ; - for method = [-1 0 1000 2000:2009] + for method = [-1 0 1000 2000:2009 3000:3019] C = GB_mex_serialize (A, method) ; GB_spec_compare (A, C) ; end diff --git a/GraphBLAS/Test/test244.m b/GraphBLAS/Test/test244.m new file mode 100644 index 000000000..fde43361b --- /dev/null +++ b/GraphBLAS/Test/test244.m @@ -0,0 +1,75 @@ +function test244 +%TEST244 test reshape + +% SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. +% SPDX-License-Identifier: Apache-2.0 + +rng ('default') + +[~, ~, ~, types, ~, ~] = GB_spec_opsall ; +types = types.all ; + +for k1 = 1:length(types) + type = types {k1} ; + fprintf ('\n%-14s ', type) ; + + for m = [1 2 6] % 1:6 + for n = [1 2 6] % 1:6 + mn = m*n ; + f = factor (mn) ; + for d = [0.3 inf] + A = GB_spec_random (m, n, d, 99, type) ; + fprintf ('.') ; + for sparsity = [1 2 4 8] + A.sparsity = sparsity ; + for is_csc = [0 1] + A.is_csc = is_csc ; + for iso = [false true] + A.iso = iso ; + + for k = 1:length (f) + S = nchoosek (f, k) ; + for i = 1:size(S,1) + + m2 = prod (S (i,:)) ; + n2 = mn / m2 ; + + % reshape by column + C1 = A ; + x = 1 ; + if (iso) + [i,j,x] = find (C1.matrix, 1,'first') ; + C1.matrix (C1.pattern) = x ; + end + C1.matrix = reshape (C1.matrix, m2, n2) ; + C1.pattern = reshape (C1.pattern, m2, n2) ; + + for inplace = [false true] + C2 = GB_mex_reshape (A, m2, n2, ... + true, inplace) ; + GB_spec_compare (C1, C2, 0) ; + end + + % reshape by row + C1 = A ; + if (iso) + C1.matrix (C1.pattern) = x ; + end + C1.matrix = reshape (C1.matrix', n2, m2)' ; + C1.pattern = reshape (C1.pattern', n2, m2)'; + for inplace = [false true] + C2 = GB_mex_reshape (A, m2, n2, ... + false, inplace) ; + GB_spec_compare (C1, C2, 0) ; + end + end + end + end + end + end + end + end + end +end + +fprintf ('\ntest244: all tests passed\n') ; diff --git a/GraphBLAS/Test/testall.m b/GraphBLAS/Test/testall.m index 352fc0ba8..f9d51811c 100644 --- a/GraphBLAS/Test/testall.m +++ b/GraphBLAS/Test/testall.m @@ -63,9 +63,9 @@ function testall (threads,longtests) hack (2) = 0 ; GB_mex_hack (hack) ; -%------------------------------------------------------------------------------- -% quick tests for statement coverage -%------------------------------------------------------------------------------- +%=============================================================================== +% quick tests for statement coverage, with malloc debugging +%=============================================================================== % Timings below are for test coverage (Tcov), with malloc debuging enabled, on % hypersparse.cse.tamu.edu (20 core Xeon). 
Times will differ if this test is @@ -75,100 +75,34 @@ function testall (threads,longtests) % tests with high rates (over 100/sec) %---------------------------------------- -logstat ('test243',t) ; % test GxB_Vector_Iterator -logstat ('test242',t) ; % test GxB_Iterator for matrices +logstat ('test01' ,t) ; % error handling +logstat ('test199',t) ; % test dot2 with hypersparse +logstat ('test83' ,t) ; % GrB_assign with C_replace and empty J +logstat ('test210',t) ; % test iso assign25: C=A, C empty, A dense +logstat ('test165',t) ; % test C=A*B' where A is diagonal and B becomes bitmap +logstat ('test219',s) ; % test reduce to scalar (1 thread) logstat ('test241',t) ; % test GrB_mxm, triggering the swap_rule -logstat ('testca',t) ; % test complex mxm, mxv, and vxm logstat ('test240',t) ; % test dot4 and saxpy5 +logstat ('test220',t) ; % test mask C=Z, iso case +logstat ('test211',t) ; % test iso assign +logstat ('test202',t) ; % test iso add and emult +logstat ('test152',t) ; % test binops with C=A+B, all matrices dense +logstat ('test222',t) ; % test user selectop for iso matrices -% disable the Werk stack for these tests: -hack (2) = 1 ; -GB_mex_hack (hack) ; - -logstat ('test240',t) ; % test dot4 and saxpy5 again -logstat ('testca',t) ; % test complex mxm, mxv, and vxm -logstat ('test238',t) ; % test GrB_mxm (dot4 and dot2) -logstat ('test237',t) ; % test GrB_mxm (saxpy4) -logstat ('test236',t) ; % test GxB_Matrix_sort and GxB_Vector_sort -logstat ('test192',t) ; % test C=scalar -logstat ('test191',t) ; % test split -logstat ('test188',t) ; % test concat -logstat ('test187',t) ; % test dup/assign for all sparsity formats +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack logstat ('test186',t) ; % test saxpy for all sparsity formats -logstat ('test186',s) ; % test saxpy for all sparsity formats -logstat ('test185',s) ; % test dot4, saxpy for all sparsity formats -logstat ('test184',t) ; % test special cases for mxm, transpose, and build -logstat ('test181',s) ; % test transpose with explicit zeros in the mask -logstat ('test180',s) ; % test assign and subassign (single threaded) -logstat ('test180',t) ; % test assign and subassign (multi threaded) logstat ('test150',t) ; % mxm with zombies and typecasting (dot3 and saxpy) -logstat ('test14' ,t) ; % GrB_reduce -logstat ('test154',t) ; % apply with binop and scalar binding -logstat ('test151b',t); % test bshift operator - -% re-enable the Werk stack for most tests: -hack (2) = 0 ; -GB_mex_hack (hack) ; +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack logstat ('test239',t) ; % test GxB_eWiseUnion -logstat ('test74' ,t) ; % test GrB_mxm on all semirings logstat ('test235',t) ; % test GxB_eWiseUnion and GrB_eWiseAdd -logstat ('test234',t) ; % test GxB_eWiseUnion -logstat ('test233',t) ; % test bitmap saxpy C=A*B with A sparse and B bitmap -logstat ('test232',t) ; % test assign with GrB_Scalar -logstat ('test231',t) ; % test GrB_select with idxunp -logstat ('test230',t) ; % test apply with idxunops -logstat ('test229',t) ; % test setElement -logstat ('test228',t) ; % test serialize/deserialize -logstat ('test227',t) ; % test kron logstat ('test226',t) ; % test kron with iso matrices -logstat ('test225',t) ; % test mask operations (GB_masker) -logstat ('test224',t) ; % test unpack/pack logstat ('test223',t) ; % test matrix multiply, C=A*B -logstat ('test222',t) ; % test user selectop for iso matrices -logstat ('test221',t) ; % test C += A where C is bitmap and A is full -logstat ('test220',t) ; % test mask C=Z, iso case 
-logstat ('test219',s) ; % test reduce to scalar (1 thread) -logstat ('test217',t) ; % test C(I,J)=A, bitmap assign -logstat ('test216',t) ; % test C=A, iso case -logstat ('test215',t) ; % test C=A'*B (dot2, ANY_PAIR semiring) -logstat ('test214',t) ; % test C=A'*B (tricount) -logstat ('test213',t) ; % test iso assign (method 05d) -logstat ('test212',t) ; % test iso mask all zero -logstat ('test211',t) ; % test iso assign -logstat ('test210',t) ; % test iso assign25: C=A, C empty, A dense -logstat ('test209',t) ; % test iso build -logstat ('test208',t) ; % test iso apply, bind 1st and 2nd -logstat ('test207',t) ; % test iso subref -logstat ('test206',t) ; % test iso select logstat ('test204',t) ; % test iso diag logstat ('test203',t) ; % test iso subref -logstat ('test202',t) ; % test iso add and emult -logstat ('test201',t) ; % test iso reduce to vector -logstat ('test200',t) ; % test iso full matrix multiply -logstat ('test199',t) ; % test dot2 with hypersparse -logstat ('test198',t) ; % test apply with C=op(C) -logstat ('test197',t) ; % test large sparse split -logstat ('test196',t) ; % test hypersparse concat - -logstat ('test195',t) ; % test all variants of saxpy3 -logstat ('test194',t) ; % test GxB_Vector_diag -logstat ('test193',t) ; % test GxB_Matrix_diag -logstat ('test189',t) ; % test large assign logstat ('test183',s) ; % test eWiseMult with hypersparse mask - -logstat ('test182',s) ; % test for internal wait logstat ('test179',t) ; % test bitmap select - -logstat ('test165',t) ; % test C=A*B' where A is diagonal and B becomes bitmap - -logstat ('test01' ,t) ; % error handling -logstat ('test83' ,t) ; % GrB_assign with C_replace and empty J - -logstat ('test176',t) ; % test GrB_assign, method 09, 11 logstat ('test174',t) ; % test GrB_assign C=A -logstat ('test170',t) ; % test C=A+B (alias M==B) -logstat ('test152',t) ; % test binops with C=A+B, all matrices dense logstat ('test155',t) ; % test GrB_*_setElement and GrB_*_removeElement logstat ('test156',t) ; % test GrB_assign C=A with typecasting logstat ('test136',s) ; % subassignment special cases @@ -176,110 +110,210 @@ function testall (threads,longtests) logstat ('test109',t) ; % terminal monoid with user-defined type logstat ('test109',s) ; % terminal monoid with user-defined type logstat ('test04' ,t) ; % simple mask and transpose test - -%---------------------------------------- -% tests with good rates (30 to 100/sec) -%---------------------------------------- - -logstat ('test142',t) ; % test GrB_assign with accum +logstat ('test207',t) ; % test iso subref +logstat ('test221',t) ; % test C += A where C is bitmap and A is full logstat ('test162',t) ; % test C=A*B with very sparse M -logstat ('test161',t) ; % test A*B*E logstat ('test159',t) ; % test A*B -logstat ('test137',s) ; % GrB_eWiseMult with FIRST and SECOND operators -logstat ('test139',s) ; % merge sort, special cases logstat ('test09' ,t) ; % duplicate I,J test of GB_mex_subassign logstat ('test132',t) ; % setElement logstat ('test141',t) ; % eWiseAdd with dense matrices +logstat ('testc2(1,1)',t) ; % complex tests (quick case, builtin) +logstat ('test214',t) ; % test C=A'*B (tricount) +logstat ('test213',t) ; % test iso assign (method 05d) +logstat ('test206',t) ; % test iso select +logstat ('test212',t) ; % test iso mask all zero +logstat ('test128',t) ; % eWiseMult, eWiseAdd, eWiseUnion special cases +logstat ('test82' ,t) ; % GrB_extract with index range (hypersparse) + +%---------------------------------------- +% tests with good rates (30 to 100/sec) 
+%---------------------------------------- + +logstat ('test229',t) ; % test setElement logstat ('test144',t) ; % cumsum -logstat ('test145',t) ; % dot4 for C += A'*B + +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test14' ,t) ; % GrB_reduce +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack %---------------------------------------- -% tests with decent rates (30 to 40/sec) +% tests with decent rates (20 to 30/sec) %---------------------------------------- -logstat ('test92' ,t) ; % GB_subref: symbolic case -logstat ('test108',t) ; % boolean monoids +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test180',s) ; % test assign and subassign (single threaded) +logstat ('test236',t) ; % test GxB_Matrix_sort and GxB_Vector_sort +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack + +%---------------------------------------- +% tests with decent rates (10 to 20/sec) +%---------------------------------------- + +logstat ('test232',t) ; % test assign with GrB_Scalar +logstat ('test228',t) ; % test serialize/deserialize + +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test154',t) ; % apply with binop and scalar binding +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack + +%---------------------------------------- +% tests with low coverage/sec rates (1/sec to 10/sec) +%---------------------------------------- + +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test238',t) ; % test GrB_mxm (dot4 and dot2) +logstat ('test151b',t); % test bshift operator +logstat ('test184',t) ; % test special cases for mxm, transpose, and build +logstat ('test191',t) ; % test split +logstat ('test188',t) ; % test concat +logstat ('test237',t) ; % test GrB_mxm (saxpy4) +logstat ('test240',t) ; % test dot4 and saxpy5 again +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack + +logstat ('test224',t) ; % test unpack/pack +logstat ('test196',t) ; % test hypersparse concat +logstat ('test209',t) ; % test iso build +logstat ('test104',t) ; % export/import + +%---------------------------------------- +% tests with very low coverage/sec rates (< 1/sec) +%---------------------------------------- + +logstat ('test189',t) ; % test large assign +logstat ('test194',t) ; % test GxB_Vector_diag +logstat ('test76' ,s) ; % GxB_resize (single threaded). 
*** + +logstat ('test244',t) ; % test GxB_Matrix_reshape* + +%=============================================================================== +% tests with no malloc debugging +%=============================================================================== + +% Turn off malloc debugging +malloc_debugging = stat ; +if (malloc_debugging) + debug_off + fprintf ('[malloc debugging turned off]\n') ; + f = fopen ('log.txt', 'a') ; + fprintf (f, '[malloc debugging turned off]\n') ; + fclose (f) ; +end + +%---------------------------------------- +% tests with good rates (30 to 100/sec) +%---------------------------------------- + +logstat ('test201',t) ; % test iso reduce to vector +logstat ('test225',t) ; % test mask operations (GB_masker) +logstat ('test170',t) ; % test C=A+B (alias M==B) +logstat ('test176',t) ; % test GrB_assign, method 09, 11 + +logstat ('test208',t) ; % test iso apply, bind 1st and 2nd +logstat ('test216',t) ; % test C=A, iso case +logstat ('test142',t) ; % test GrB_assign with accum +logstat ('test137',s) ; % GrB_eWiseMult with FIRST and SECOND operators +logstat ('test139',s) ; % merge sort, special cases + +logstat ('test145',t) ; % dot4 for C += A'*B logstat ('test172',t) ; % test eWiseMult with M bitmap/full logstat ('test148',t) ; % ewise with alias -logstat ('testc2(1)',t) ; % complex tests (quick case) %---------------------------------------- % tests with decent rates (20 to 30/sec) %---------------------------------------- -logstat ('test173',t) ; % test GrB_assign C=A +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test186',s) ; % test saxpy for all sparsity formats +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack + logstat ('test157',t) ; % test sparsity formats -logstat ('test29' ,t) ; % reduce with zombies +logstat ('test182',s) ; % test for internal wait %---------------------------------------- % tests with decent rates (10 to 20/sec) %---------------------------------------- -logstat ('test128',t) ; % eWiseMult, eWiseAdd, eWiseUnion special cases +logstat ('test108',t) ; % boolean monoids +logstat ('test130',t) ; % GrB_apply, hypersparse cases +logstat ('test124',t) ; % GrB_extract, case 6 +logstat ('test138',s) ; % test assign, with coarse-only tasks in IxJ slice + +logstat ('test227',t) ; % test kron + +% longer tests but with decent coverage rates: logstat ('test125',t) ; % test GrB_mxm: row and column scaling -logstat ('test82' ,t) ; % GrB_extract with index range (hypersparse) %---------------------------------------- -% tests with low coverage/sec rates (1/sec to 10/sec) +% 1 to 10/sec %---------------------------------------- +logstat ('test234',t) ; % test GxB_eWiseUnion + +logstat ('test242',t) ; % test GxB_Iterator for matrices +logstat ('test173',t) ; % test GrB_assign C=A +logstat ('test200',t) ; % test iso full matrix multiply +logstat ('test197',t) ; % test large sparse split + logstat ('test158',t) ; % test colscale and rowscale logstat ('test84' ,t) ; % GrB_assign (row and column with C in CSR/CSC format) -logstat ('test130',t) ; % GrB_apply, hypersparse cases - logstat ('test19b',t) ; % GrB_assign, many pending operators logstat ('test19b',s) ; % GrB_assign, many pending operators logstat ('test133',t) ; % test mask operations (GB_masker) logstat ('test80' ,t) ; % test GrB_mxm on all semirings (different matrix) logstat ('test151',t) ; % test bitwise operators -logstat ('test124',t) ; % GrB_extract, case 6 logstat ('test23' ,t) ; % quick test of GB_*_build - -logstat ('test175',t) ; % test142 updated 
-logstat ('test160',t) ; % test A*B, parallel +logstat ('test135',t) ; % reduce to scalar logstat ('test160',s) ; % test A*B, single threaded logstat ('test54' ,t) ; % assign and extract with begin:inc:end -logstat ('test104',t) ; % export/import -logstat ('test11' ,t) ; % exhaustive test of GrB_extractTuples + +logstat ('test129',t) ; % test GxB_select (tril and nonzero, hypersparse) +logstat ('test69' ,t) ; % assign and subassign with alias +logstat ('test230',t) ; % test apply with idxunops +logstat ('test74' ,t) ; % test GrB_mxm on all semirings +logstat ('test127',t) ; % test eWiseAdd, eWiseMult (all types and operators) +logstat ('test19',t) ; % GxB_subassign, many pending operators %---------------------------------------- -% tests with very low coverage/sec rates (< 1/sec) +% < 1 per sec %---------------------------------------- -logstat ('test129',t) ; % test GxB_select (tril and nonzero, hypersparse) -logstat ('test138',s) ; % test assign, with coarse-only tasks in IxJ slice -logstat ('test127',t) ; % test eWiseAdd, eWiseMult (all types and operators) -logstat ('test76' ,s) ; % GxB_resize (single threaded) -logstat ('test107',t) ; % monoids with terminal values -logstat ('test69' ,t) ; % assign and subassign with alias -logstat ('test135',t) ; % reduce to scalar -logstat ('test17' ,t) ; % quick test of GrB_*_extractElement -logstat ('test53' ,t) ; % quick test of GB_mex_Matrix_extract +logstat ('test11' ,t) ; % exhaustive test of GrB_extractTuples +logstat ('test160',t) ; % test A*B, parallel +logstat ('test215',t) ; % test C=A'*B (dot2, ANY_PAIR semiring) -logstat ('test19',t) ; % GxB_subassign, many pending operators +logstat ('test193',t) ; % test GxB_Matrix_diag +logstat ('test195',t) ; % test all variants of saxpy3 +logstat ('test233',t) ; % test bitmap saxpy C=A*B with A sparse and B bitmap +logstat ('test243',t) ; % test GxB_Vector_Iterator +logstat ('test29' ,t) ; % reduce with zombies + +logstat ('testca(1)',t) ; % test complex mxm, mxv, and vxm +hack (2) = 1 ; GB_mex_hack (hack) ; % disable the Werk stack +logstat ('test187',t) ; % test dup/assign for all sparsity formats +logstat ('test192',t) ; % test C=scalar +logstat ('test181',s) ; % test transpose with explicit zeros in the mask +logstat ('test185',s) ; % test dot4, saxpy for all sparsity formats +hack (2) = 0 ; GB_mex_hack (hack) ; % re-enable the Werk stack +logstat ('test53' ,t) ; % quick test of GB_mex_Matrix_extract +logstat ('test17' ,t) ; % quick test of GrB_*_extractElement +logstat ('test231',t) ; % test GrB_select with idxunp %---------------------------------------- % longer tests (200 seconds to 600 seconds) %---------------------------------------- -% Turn off malloc debugging -malloc_debugging = stat ; -if (malloc_debugging) - debug_off - fprintf ('[malloc debugging turned off]\n') ; - f = fopen ('log.txt', 'a') ; - fprintf (f, '[malloc debugging turned off]\n') ; - fclose (f) ; -end - logstat ('test10' ,t) ; % GrB_apply logstat ('test75b',t) ; % test GrB_mxm A'*B (quicker than test75) -logstat ('test16' ,t) ; % user-defined complex operators +logstat ('testc2(0,0)',t) ; % A'*B, A+B, A*B, user-defined complex type +logstat ('testc4(0)',t) ; % extractElement, setElement, user-defined complex +logstat ('testc7(0)',t) ; % assign, builtin complex +logstat ('testcc(1)',t) ; % transpose, builtin complex logstat ('test81' ,t) ; % GrB_Matrix_extract with stride, range, backwards logstat ('test21b',t) ; % quick test of GB_mex_assign logstat ('test18' ,t) ; % quick tests of GrB_eWiseAdd and eWiseMult 
-%------------------------------------------------------------------------------- + +%=============================================================================== % The following tests are not required for statement coverage. Some need % other packages in SuiteSparse (CSparse, SSMULT, ssget). By default, these % tests are not run. To install them, see test_other.m. Timing is with malloc @@ -302,6 +336,7 @@ function testall (threads,longtests) logstat ('test13',t) ; % % simple tests of GB_mex_transpose logstat ('test15',t) ; % simple test of GB_mex_AxB +logstat ('test16' ,t) ; % 177 % user-defined complex operators logstat ('test18(1)',t) ; % % lengthy tests of GrB_eWiseAdd and eWiseMult logstat ('test20',t) ; % quick test of GB_mex_mxm on a few semirings @@ -381,6 +416,7 @@ function testall (threads,longtests) logstat ('test90',t) ; % 1 % test user-defined semirings logstat ('test91',t) ; % % test subref performance on dense vectors +logstat ('test92' ,t) ; % .1 % GB_subref: symbolic case logstat ('test95',t) ; % % performance test for GrB_transpose logstat ('test96',t) ; % 16 % A*B using dot product logstat ('test97',t) ; % 0 % GB_mex_assign, scalar expansion and zombies @@ -392,6 +428,7 @@ function testall (threads,longtests) logstat ('test103',t) ; % % GrB_transpose aliases logstat ('test105',t) ; % 2 % eWiseAdd for hypersparse logstat ('test106',t) ; % 4 % GxB_subassign with alias +logstat ('test107',t) ; % 2 % monoids with terminal values logstat ('test110',t) ; % 0 % binary search of M(:,j) in accum/mask logstat ('test111',t) ; % % performance test for eWiseAdd @@ -417,6 +454,7 @@ function testall (threads,longtests) logstat ('test147',t) ; % C=A*B with very sparse M logstat ('test149',t) ; % test fine hash tasks for C=A*B +logstat ('test161',t) ; % % test A*B*E logstat ('test163',t) ; % .6 % test C=A'*B where C and M are sparse logstat ('test164',t) ; % 0 % test dot5 method logstat ('test166',t) ; % .1 % test GxB_select with a dense matrix @@ -425,14 +463,18 @@ function testall (threads,longtests) logstat ('test169',t) ; % 0 % test C=A+B with C sparse, M hyper, A and B sparse logstat ('test171',t) ; % 1 % test conversion and GB_memset -logstat ('test177',t) ; % 1.2 % test C=A*B, C and B bitmap, M and A sparse +logstat ('test175',t) ; % 8 % test142 updated +logstat ('test177',t) ; % 1.2 % test C=A*B, C and B bitmap, M and A sparse + +logstat ('test180',t) ; % 16 % test assign and subassign (multi threaded) logstat ('test190',t) ; % .3 % test dense matrix for C=A*B +logstat ('test198',t) ; % .1 % test apply with C=op(C) logstat ('test205',t) ; % 0 % test iso kron +logstat ('test217',t) ; % 0 % test C(I,J)=A, bitmap assign logstat ('test218',t) ; % 0 % test C=A+B, C and A are full, B is bitmap -% tested via test16: logstat ('testc1',t) ; % % test complex operators logstat ('testc2',t) ; % % test complex A*B, A'*B, A*B', A'*B', A+B logstat ('testc3',t) ; % % test complex GrB_extract diff --git a/GraphBLAS/Test/testc1.m b/GraphBLAS/Test/testc1.m index 013a32b8c..17bd1ec61 100644 --- a/GraphBLAS/Test/testc1.m +++ b/GraphBLAS/Test/testc1.m @@ -1,4 +1,4 @@ -function testc1 +function testc1(use_builtin) %TESTC1 test complex operators % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
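The use_builtin argument added to the testc* scripts below (toggled through GB_builtin_complex_set) switches the complex tests between GraphBLAS's built-in GxB_FC64 type and an equivalent user-defined type. As a sketch, not part of this diff, the two flavors are obtained in C roughly as follows:

    #include <complex.h>
    #include "GraphBLAS.h"

    // the two flavors of double complex that use_builtin toggles between
    void complex_types (GrB_Type *builtin, GrB_Type *user_defined)
    {
        // built-in double complex: predefined, nothing to create
        *builtin = GxB_FC64 ;
        // user-defined double complex: an opaque 16-byte type; GraphBLAS
        // knows only its size, so its operators must be user-defined too
        GrB_Type_new (user_defined, sizeof (double _Complex)) ;
    }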
@@ -6,6 +6,11 @@ rng 'default' +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + A = sparse (rand (2) + 1i * rand (2)) ; C = GB_mex_dump (A,0) ; GB_spec_compare (C, A) ; @@ -99,3 +104,4 @@ fprintf ('testc1: all complex operator tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc2.m b/GraphBLAS/Test/testc2.m index 6576e8313..3e2c23297 100644 --- a/GraphBLAS/Test/testc2.m +++ b/GraphBLAS/Test/testc2.m @@ -1,4 +1,4 @@ -function testc2(quick) +function testc2(quick,use_builtin) %TESTC2 test complex A*B, A'*B, A*B', A'*B', A+B % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -6,6 +6,11 @@ function testc2(quick) rng ('default') ; +if (nargin < 2) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + if (nargin < 1) quick = 0 ; end @@ -134,4 +139,5 @@ function testc2(quick) fprintf ('testc2: all complex A*B, A''*B, A*B'', A''*B'' tests passed, maxerr %g\n', maxerr) ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc3.m b/GraphBLAS/Test/testc3.m index e0b550de7..9a516c959 100644 --- a/GraphBLAS/Test/testc3.m +++ b/GraphBLAS/Test/testc3.m @@ -1,4 +1,4 @@ -function testc3 +function testc3(use_builtin) %TESTC3 test complex GrB_extract % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -6,6 +6,11 @@ rng ('default') ; +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + d = struct ('outp', 'replace') ; seed = 1 ; for m = [1 5 10 100] @@ -97,3 +102,4 @@ fprintf ('\ntestc3: all tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc4.m b/GraphBLAS/Test/testc4.m index 6c196f5f9..36bb37a93 100644 --- a/GraphBLAS/Test/testc4.m +++ b/GraphBLAS/Test/testc4.m @@ -1,4 +1,4 @@ -function testc4 +function testc4(use_builtin) %TESTC4 test complex extractElement and setElement % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -6,11 +6,17 @@ rng ('default') ; +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + seed = 1 ; for m = [1 5 10 100] for n = [1 5 10 100] seed = seed + 1 ; A = GB_mex_random (m, n, 10*(m+n), 1, seed) ; + S = logical (spones (A)) ; ktuples = 400 ; @@ -23,11 +29,15 @@ use_scalar = (rand (1) > 0.8) ; x1 = GB_mex_Matrix_extractElement (A, I0, J0, '', use_scalar) ; + s1 = GB_mex_Matrix_isStoredElement (A, I0, J0) ; x2 = complex (zeros (ktuples,1)) ; + s2 = false (ktuples,1) ; for k = 1:ktuples x2 (k) = A (I (k), J (k)) ; + s2 (k) = S (I (k), J (k)) ; end assert (isequal (x1, x2)) + assert (isequal (s1, s2)) if (n == 1) x1 = GB_mex_Vector_extractElement (A, I0, '', use_scalar) ; @@ -72,3 +82,4 @@ fprintf ('\ntestc4: all tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc5.m b/GraphBLAS/Test/testc5.m index 340f7673d..7609866a3 100644 --- a/GraphBLAS/Test/testc5.m +++ b/GraphBLAS/Test/testc5.m @@ -1,9 +1,14 @@ -function testc5 +function testc5(use_builtin) %TESTC5 test complex subref % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: Apache-2.0 +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + seed = 1 ; for m = [1 5 10 100] for n = [1 5 10 100] @@ -24,3 +29,4 @@ end fprintf ('testc5: all complex subref C = A(I,J) tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc6.m b/GraphBLAS/Test/testc6.m index 319c4e909..d7a553664 100644 --- a/GraphBLAS/Test/testc6.m +++ b/GraphBLAS/Test/testc6.m @@ -1,4 +1,4 @@ -function testc6 +function testc6(use_builtin) %TESTC6 test complex apply % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -6,6 +6,11 @@ rng 'default' +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + [complex_binary complex_unary] = GB_user_opsall ; dr = struct ('outp', 'replace') ; @@ -91,3 +96,4 @@ fprintf ('testc6: all complex apply C=op(A) tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc7.m b/GraphBLAS/Test/testc7.m index 8a4cfdeac..25a00e019 100644 --- a/GraphBLAS/Test/testc7.m +++ b/GraphBLAS/Test/testc7.m @@ -1,4 +1,4 @@ -function testc7 +function testc7(use_builtin) %TESTC7 test complex assign % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -7,6 +7,11 @@ fprintf ('\ntestc7: all complex assign C(I,J)=A --------------------------\n') ; rng ('default') +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + dclear.outp = 'replace' ; dclear.mask = 'complement' ; tol = 1e-13 ; @@ -79,3 +84,5 @@ fprintf ('\ntestc7: all complex assign C(I,J)=A tests passed\n') ; +GB_builtin_complex_set (true) ; + diff --git a/GraphBLAS/Test/testc8.m b/GraphBLAS/Test/testc8.m index 4ece2d239..277d18ef3 100644 --- a/GraphBLAS/Test/testc8.m +++ b/GraphBLAS/Test/testc8.m @@ -1,4 +1,4 @@ -function testc8 +function testc8(use_builtin) %TESTC8 test complex eWiseAdd and eWiseMult % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. @@ -6,6 +6,11 @@ fprintf ('testc8: test complex eWiseAdd and eWiseMult\n') ; +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + rng ('default') seed = 1 ; for m = [1 5 10 100] @@ -41,3 +46,4 @@ fprintf ('testc8: all complex eWise tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testc9.m b/GraphBLAS/Test/testc9.m index ab4bea181..29cd6873c 100644 --- a/GraphBLAS/Test/testc9.m +++ b/GraphBLAS/Test/testc9.m @@ -1,9 +1,14 @@ -function testc9 +function testc9(use_builtin) %TESTC9 test complex extractTuples % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: Apache-2.0 +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + seed = 1 ; for m = [1 5 10 100] for n = [1 5 10 100] @@ -28,3 +33,4 @@ fprintf ('testc9: all complex extractTuples tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testca.m b/GraphBLAS/Test/testca.m index 115d4af99..cbe59b612 100644 --- a/GraphBLAS/Test/testca.m +++ b/GraphBLAS/Test/testca.m @@ -1,9 +1,14 @@ -function testca +function testca(use_builtin) %TESTCA test complex mxm, mxv, and vxm % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. 
% SPDX-License-Identifier: Apache-2.0 +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + fprintf ('testca: test complex mxm, mxv, and vxm\n') ; rng ('default') ; dnn = struct ; @@ -103,3 +108,5 @@ fprintf ('\ntestca: all complex mxm, mxv, vxm tests passed\n') ; +GB_builtin_complex_set (true) ; + diff --git a/GraphBLAS/Test/testcb.m b/GraphBLAS/Test/testcb.m index 349087c4e..a682444f2 100644 --- a/GraphBLAS/Test/testcb.m +++ b/GraphBLAS/Test/testcb.m @@ -1,9 +1,14 @@ -function testcb +function testcb(use_builtin) %TESTCB test complex reduce % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: Apache-2.0 +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + tol = 1e-13 ; seed = 1 ; for m = [1 5 10 100] @@ -57,3 +62,4 @@ fprintf ('testcb: all complex reduce tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testcc.m b/GraphBLAS/Test/testcc.m index 498f1b4c2..705758397 100644 --- a/GraphBLAS/Test/testcc.m +++ b/GraphBLAS/Test/testcc.m @@ -1,9 +1,14 @@ -function testcc +function testcc(use_builtin) %TESTCC test complex transpose % SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved. % SPDX-License-Identifier: Apache-2.0 +if (nargin < 1) + use_builtin = true ; +end +GB_builtin_complex_set (use_builtin) ; + dt = struct ('inp0', 'tran') ; seed = 1 ; for m = [1 5 10 100] @@ -30,3 +35,4 @@ fprintf ('testcc: all complex transpose tests passed\n') ; +GB_builtin_complex_set (true) ; diff --git a/GraphBLAS/Test/testcz.m b/GraphBLAS/Test/testcz.m new file mode 100644 index 000000000..5f5301572 --- /dev/null +++ b/GraphBLAS/Test/testcz.m @@ -0,0 +1,31 @@ +threads {1} = [4 1] ; +t = threads ; + +% user-defined complex type +% logstat ('testc1(0)',t) ; % test ops +logstat ('testc2(0,0)',t) ; % A'*B, A+B, A*B +% logstat ('testc3(0)',t) ; % extract column, extract submatrix +logstat ('testc4(0)',t) ; % extractElement, setElement +% logstat ('testc5(0)',t) ; % subref +% logstat ('testc6(0)',t) ; % apply +logstat ('testc7(0)',t) ; % assign +% logstat ('testc8(0)',t) ; % eWise +% logstat ('testc9(0)',t) ; % extractTuples +% logstat ('testca(0)',t) ; % mxm, mxv, vxm +% logstat ('testcb(0)',t) ; % reduce +% logstat ('testcc(0)',t) ; % transpose + +% builtin complex type: GxB_FC64 +% logstat ('testc1(1)',t) ; % test ops +% logstat ('testc2(0,1)',t) ; % A'*B, A+B, A*B +% logstat ('testc3(1)',t) ; % extract column, extract submatrix +% logstat ('testc4(1)',t) ; % extractElement, setElement +% logstat ('testc5(1)',t) ; % subref +% logstat ('testc6(1)',t) ; % apply +logstat ('testc7(1)',t) ; % assign +% logstat ('testc8(1)',t) ; % eWise +% logstat ('testc9(1)',t) ; % extractTuples +% logstat ('testca(1)',t) ; % mxm, mxv, vxm +% logstat ('testcb(1)',t) ; % reduce +logstat ('testcc(1)',t) ; % transpose + diff --git a/GraphBLAS/alternative/Makefile b/GraphBLAS/alternative/Makefile index f2f83a1e9..50a841582 100644 --- a/GraphBLAS/alternative/Makefile +++ b/GraphBLAS/alternative/Makefile @@ -20,8 +20,8 @@ default: library # This version info must match ../CMakeLists.txt VER1 = 7 -VER2 = 0 -VER3 = 3 +VER2 = 2 +VER3 = 0 # pick your compiler: CC = gcc @@ -33,20 +33,27 @@ VER3 = 3 # CC = icc -mp1 # CC = c++ +# when using clang +# CFLAGS += -Xclang + SRC = ../Source/*.c ../Source/Generated1/*.c ../Source/Generated2/*.c -INC = ../Include/*.h ../Source/*.h ../Source/Template/* ../Source/Generated1/*.h ../Source/Generated2/*.h +INC = ../Include/*.h 
../Source/*.h ../Source/Template/* ../Source/Generated1/*.h ../Source/Generated2/*.h ../rmm_wrap/*.h* SRC2 = $(notdir $(wildcard $(SRC))) OBJ = $(SRC2:.c=.o) -LDFLAGS = -fopenmp -lm -CFLAGS = -fopenmp -fexceptions -fPIC + +# pick OpenMP options: +# LDFLAGS = -fopenmp -lm +# LDFLAGS = -openmp -lm +CFLAGS += -fopenmp -fexceptions -fPIC + # pick the optimization level: CFLAGS += -O3 # CFLAGS += -g + ifneq ($(CC),c++) CFLAGS += -std=c11 endif -CPPFLAGS = -I../Include -I../Source -I../Source/Template -I../Source/Generated1 -I../Source/Generated2 -I../lz4 -I../cpu_features/include -SO_OPTS = $(LDFLAGS) +CPPFLAGS = -I../Include -I../Source -I../Source/Template -I../Source/Generated1 -I../Source/Generated2 -I../lz4 -I../cpu_features/include -I../cpu_features -I../cpu_features/src -I../cpu_features/include/internal -I../rmm_wrap CFLAGS += -Wno-pragmas # To compile the libgraphblas_renamed library, change all occurences of @@ -54,19 +61,35 @@ CFLAGS += -Wno-pragmas # CFLAGS += -DGBRENAME=1 # CPPFLAGS += -I../GraphBLAS/rename +# Select options for cpu_features: +# no cpu_features: +# CFLAGS += -DGBNCPUFEAT=1 +# cpu_features with getauxval (does not work on the Mac): +# CFLAGS += -DHAVE_STRONG_GETAUXVAL=1 +# cpu_features with dlfcn.h (works on the Mac): +# CFLAGS += -DHAVE_DLFCN_H=1 +# To enable X86, and AVX2 and/or AVX512 when not using cpu_features: +# CFLAGS += -DGBX86=1 +# CFLAGS += -DGBAVX2=1 +# CFLAGS += -DGBAVX512=1 + UNAME := $(shell uname) ifeq ($(UNAME),Darwin) # Mac + CFLAGS += -DHAVE_DLFCN_H=1 CFLAGS += -fno-common SO_NAME = libgraphblas.dylib.$(VER1).$(VER2).$(VER3) SO_NAME0 = libgraphblas.dylib SO_NAME1 = libgraphblas.dylib.$(VER1) + SO_OPTS = $(LDFLAGS) SO_OPTS += -dynamiclib -shared -Wl,-install_name -Wl,$(SO_NAME1) -undefined dynamic_lookup else # Linux + CFLAGS += -DHAVE_DLFCN_H=1 -DHAVE_STRONG_GETAUXVAL=1 SO_NAME = libgraphblas.so.$(VER1).$(VER2).$(VER3) SO_NAME0 = libgraphblas.so SO_NAME1 = libgraphblas.so.$(VER1) + SO_OPTS = $(LDFLAGS) SO_OPTS += -shared -Wl,-soname -Wl,$(SO_NAME1) endif @@ -114,7 +137,7 @@ DOBJ = $(DSRC2:.c=.o) $(CC) -c $(CFLAGS) $(DCPPFLAGS) $< -o $(notdir $@) %_demo: ../Demo/Program/%_demo.c $(SO_NAME) $(DINC) $(DOBJ) - $(CC) $(CFLAGS) $(DCPPFLAGS) $< $(DOBJ) $(DLIBS) -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) $(DCPPFLAGS) $< $(DOBJ) $(DLIBS) -o $@ DEMO_PRG = $(notdir $(wildcard ../Demo/Program/*_demo.c)) DEMO = $(DEMO_PRG:.c=) diff --git a/GraphBLAS/rmm_wrap/.gitignore b/GraphBLAS/rmm_wrap/.gitignore index 55cb890ab..b0e40afb3 100644 --- a/GraphBLAS/rmm_wrap/.gitignore +++ b/GraphBLAS/rmm_wrap/.gitignore @@ -1,2 +1,8 @@ +# Ignore these files: build/ rmm_wrap_test +rmm_log.txt + +# Do not ignore this file +!.gitignore + diff --git a/GraphBLAS/rmm_wrap/CMakeLists.txt b/GraphBLAS/rmm_wrap/CMakeLists.txt index 69644a2c0..3f10f1323 100644 --- a/GraphBLAS/rmm_wrap/CMakeLists.txt +++ b/GraphBLAS/rmm_wrap/CMakeLists.txt @@ -1,3 +1,11 @@ +#------------------------------------------------------------------------------- +# rmm_wrap/CMakeLists.txt: cmake script for building rmm_wrap +#------------------------------------------------------------------------------- + +# SPDX-License-Identifier: Apache-2.0 + +#------------------------------------------------------------------------------- + cmake_minimum_required(VERSION 3.20.1) project(rmm_wrap VERSION 0.1) diff --git a/GraphBLAS/rmm_wrap/README.md b/GraphBLAS/rmm_wrap/README.md new file mode 100644 index 000000000..37c31be42 --- /dev/null +++ b/GraphBLAS/rmm_wrap/README.md @@ -0,0 +1,35 @@ +# rmm_wrap is a C-callable wrapper for 
the NVIDIA Rapids Memory Manager. + +SPDX-License-Identifier: Apache-2.0 + +rmm_wrap defines a single global object, the RMM_Wrap_Handle that holds +an RMM (Rapids Memory Manager) memory resource and a hash map (C++ +std::unordered_map). This allows rmm_wrap to provide 7 functions to a C +application: + +Create/destroy an RMM resource: + + rmm_wrap_initialize: create the RMM resource + rmm_wrap_finalize: destroy the RMM resource + +C-style malloc/calloc/realloc/free methods: + + rmm_wrap_malloc: malloc a block of memory using RMM + rmm_wrap_calloc: calloc a block of memory using RMM + rmm_wrap_realloc: realloc a block of memory allocated by this RMM wrapper + rmm_wrap_free: free a block of memory allocated by this RMM wrapper + +PMR-based allocate/deallocate methods (C-callable): + + rmm_wrap_allocate (std::size_t *size) + rmm_wrap_deallocate (void *p, std::size_t size) + +Files in this package: + + CMakeLists.txt compiles the rmm_wrap library + README.md this file + rmm_wrap.cpp rmm_wrap_* functions + rmm_wrap.h definitions for an external C program + rmm_wrap.hpp internal definitions for rmm_wrap only + rmm_wrap_test.c tests for the rmm_wrap library + diff --git a/GraphBLAS/rmm_wrap/pmr_malloc.h b/GraphBLAS/rmm_wrap/pmr_malloc.h deleted file mode 100644 index 431750ca6..000000000 --- a/GraphBLAS/rmm_wrap/pmr_malloc.h +++ /dev/null @@ -1,19 +0,0 @@ - -#ifndef PMR_MALLOC_H -#define PMR_MALLOC_H -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - void *rmm_wrap_malloc (size_t size) ; - void rmm_wrap_free (void *) ; - void *rmm_wrap_calloc (size_t, size_t) ; - void *rmm_wrap_realloc (void *, size_t) ; - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/GraphBLAS/rmm_wrap/rmm_device.h b/GraphBLAS/rmm_wrap/rmm_device.h deleted file mode 100644 index 53dacc02f..000000000 --- a/GraphBLAS/rmm_wrap/rmm_device.h +++ /dev/null @@ -1,19 +0,0 @@ - -#ifndef RMM_META_H -#define RMM_META_H - -typedef struct -{ - char name [256] ; - size_t total_global_memory ; - int number_of_sms ; - int compute_capability_major; - int compute_capability_minor; - bool use_memory_pool; - int pool_size; // TODO: should this be size_t? - int max_pool_size; // TODO: should this be size_t? - void *memory_resource; -} -rmm_device ; // TODO: rename this? - -#endif diff --git a/GraphBLAS/rmm_wrap/rmm_log.txt b/GraphBLAS/rmm_wrap/rmm_log.txt deleted file mode 100644 index 9a9ba74b5..000000000 --- a/GraphBLAS/rmm_wrap/rmm_log.txt +++ /dev/null @@ -1,2 +0,0 @@ -[1365032][10:45:29:476098][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- -[1365032][10:45:29:476191][error ] [A][Stream 0x1][Upstream 262144B][FAILURE maximum pool size exceeded] diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.cpp b/GraphBLAS/rmm_wrap/rmm_wrap.cpp index ead45131a..05d306a32 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.cpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.cpp @@ -2,17 +2,34 @@ // rmm_wrap.cpp: C-callable wrapper for an RMM memory resource //------------------------------------------------------------------------------ -// rmm_wrap.cpp contains a single global object, the RMM_something that holds -// a RMM memory resource and a hash map (C++ std:unordered_map). This allows -// rmm_wrap to provide 6 functions to a C application: +// SPDX-License-Identifier: Apache-2.0 -// ... +//------------------------------------------------------------------------------ + +// rmm_wrap.cpp contains a single global object, the RMM_Wrap_Handle that holds +// an RMM (Rapids Memory Manager) memory resource and a hash map (C++ +// std::unordered_map).
This allows rmm_wrap to provide 7 functions to a C +application: + +// Create/destroy an RMM resource: +// rmm_wrap_initialize: create the RMM resource +// rmm_wrap_finalize: destroy the RMM resource + +// C-style malloc/calloc/realloc/free methods: +// rmm_wrap_malloc: malloc a block of memory using RMM +// rmm_wrap_calloc: calloc a block of memory using RMM +// rmm_wrap_realloc: realloc a block of memory allocated by this RMM wrapper +// rmm_wrap_free: free a block of memory allocated by this RMM wrapper + +// PMR-based allocate/deallocate methods (C-callable): +// rmm_wrap_allocate (std::size_t *size) +// rmm_wrap_deallocate (void *p, std::size_t size) #include "rmm_wrap.hpp" #include <iostream> //------------------------------------------------------------------------------ -// RMM_Wrap_Handle: a single global object +// RMM_Wrap_Handle: a global object containing the RMM context //------------------------------------------------------------------------------ // rmm_wrap_context is a pointer to a single, global RMM_Wrap_Handle object @@ -22,7 +39,7 @@ typedef struct { RMM_MODE mode; - std::shared_ptr resource; + std::shared_ptr resource; std::shared_ptr host_resource; std::shared_ptr size_map ; } @@ -32,53 +49,88 @@ RMM_Wrap_Handle ; static RMM_Wrap_Handle *rmm_wrap_context = NULL ; //------------------------------------------------------------------------------ +// make a resource pool //------------------------------------------------------------------------------ -//inline auto make_host() { return std::make_shared(); } +#if 0 +inline auto make_host() +{ + return std::make_shared() ; +} -//inline auto make_host_pinned() { return std::make_shared(); } +inline auto make_host_pinned() +{ + return std::make_shared() ; +} +#endif -inline auto make_cuda() { return std::make_shared(); } +inline auto make_cuda() +{ + return std::make_shared() ; +} -inline auto make_managed() { return std::make_shared(); } +inline auto make_managed() +{ + return std::make_shared() ; +} -//inline auto make_and_set_host_pool(std::size_t initial_size, std::size_t maximum_size) -//{ -// auto resource = std::pmr::synchronized_pool_resource(); -// -// rmm::mr::set_current_device_resource( resource ); -// return resource; -//} +#if 0 +inline auto make_and_set_host_pool +( + std::size_t initial_size, + std::size_t maximum_size +) +{ + auto resource = std::pmr::synchronized_pool_resource() ; + rmm::mr::set_current_device_resource( resource ) ; + return resource; +} - // inline auto make_and_set_host_pinned_pool(std::size_t initial_size, std::size_t maximum_size) - // { - // auto resource = rmm::mr::make_owning_wrapper - // ( make_host_pinned(), initial_size, maximum_size ); - // rmm::mr::set_current_device_resource( resource.get()); - // return resource; - // } +inline auto make_and_set_host_pinned_pool +( + std::size_t initial_size, + std::size_t maximum_size +) +{ + auto resource = rmm::mr::make_owning_wrapper + ( make_host_pinned(), initial_size, maximum_size ) ; + rmm::mr::set_current_device_resource( resource.get()) ; + return resource; +} +#endif -//alloc_map is an unordered_map of allocation address to size of each allocation +// size_map is an unordered alloc_map that maps allocation address to the size +// of each allocation -inline auto make_and_set_device_pool(std::size_t initial_size, std::size_t maximum_size) -{ +inline auto make_and_set_device_pool +( + std::size_t initial_size, + std::size_t maximum_size +) +{ auto resource = rmm::mr::make_owning_wrapper - ( make_cuda(), initial_size, maximum_size ); -
rmm::mr::set_current_device_resource( resource.get()); + ( make_cuda(), initial_size, maximum_size ) ; + rmm::mr::set_current_device_resource( resource.get()) ; return resource; } -inline auto make_and_set_managed_pool(std::size_t initial_size, std::size_t maximum_size) -{ - std::cout<< " make_managed_pool called with init_size "< - ( make_managed(), initial_size, maximum_size ); - rmm::mr::set_current_device_resource( resource.get()); + ( make_managed(), initial_size, maximum_size ) ; + rmm::mr::set_current_device_resource( resource.get()) ; return resource; } //------------------------------------------------------------------------------ -// rmm_wrap_destroy_handle: destroy the global rmm_wrap_context +// rmm_wrap_finalize: destroy the global rmm_wrap_context //------------------------------------------------------------------------------ // Destroy the rmm_wrap_context. This method allows destroys the contents of @@ -90,7 +142,7 @@ void rmm_wrap_finalize (void) if (rmm_wrap_context != NULL) { delete (rmm_wrap_context) ; - rmm_wrap_context = NULL; + rmm_wrap_context = NULL ; } } @@ -98,13 +150,18 @@ void rmm_wrap_finalize (void) // rmm_wrap_initialize: initialize the global rmm_wrap_context //------------------------------------------------------------------------------ -// Describe: -// mode: ... -// init_pool_size: ... -// max_pool_size: ... - -int rmm_wrap_initialize(RMM_MODE mode, std::size_t init_pool_size, std::size_t max_pool_size) +int rmm_wrap_initialize // returns -1 on error, 0 on success +( + RMM_MODE mode, // TODO: describe + std::size_t init_pool_size, // TODO: describe + std::size_t max_pool_size // TODO: describe +) { + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + if (rmm_wrap_context != NULL) { // rmm_wrap_initialize cannot be called twice @@ -112,9 +169,10 @@ int rmm_wrap_initialize(RMM_MODE mode, std::size_t init_pool_size, std::size_t } // create the RMM wrap handle and save it as a global pointer. 
- rmm_wrap_context = new RMM_Wrap_Handle(); + rmm_wrap_context = new RMM_Wrap_Handle() ; - std::cout<< " init called with mode "<host_resource = std::pmr::synchronized_pool_resource(); // (init_pool_size, max_pool_size) ; - //rmm_wrap_context->host_resource = make_and_set_host_pool(); // (init_pool_size, max_pool_size) ; + // rmm_wrap_context->host_resource = + // std::pmr::synchronized_pool_resource() ; + // // (init_pool_size, max_pool_size) ; + // rmm_wrap_context->host_resource = make_and_set_host_pool() ; + // // (init_pool_size, max_pool_size) ; } else if (mode == rmm_wrap_host_pinned ) { - // rmm_wrap_context->host_resource = std::pmr::synchronized_pool_resource(); // (init_pool_size, max_pool_size) ; + // rmm_wrap_context->host_resource = + // std::pmr::synchronized_pool_resource() ; + // // (init_pool_size, max_pool_size) ; } else if (mode == rmm_wrap_device ) { - rmm_wrap_context->resource = make_and_set_device_pool( init_pool_size, max_pool_size) ; + rmm_wrap_context->resource = + make_and_set_device_pool( init_pool_size, max_pool_size) ; } else if ( mode == rmm_wrap_managed ) { - rmm_wrap_context->resource = make_and_set_managed_pool( init_pool_size, max_pool_size) ; + rmm_wrap_context->resource = + make_and_set_managed_pool( init_pool_size, max_pool_size) ; } else { @@ -147,7 +212,7 @@ int rmm_wrap_initialize(RMM_MODE mode, std::size_t init_pool_size, std::size_t rmm_wrap_context->mode = mode; //-------------------------------------------------------------------------- - // create size map to lookup size of each allocation + // create size map to lookup size of each allocation //-------------------------------------------------------------------------- rmm_wrap_context->size_map = std::make_shared () ; @@ -161,49 +226,42 @@ int rmm_wrap_initialize(RMM_MODE mode, std::size_t init_pool_size, std::size_t } //------------------------------------------------------------------------------ - -/* - GrB_init (mode) ; // ANSI C11 malloc/calloc/realloc/free, no PMR - GxB_init (mode, mymalloc, mycalloc, myrealloc, myfree) - - GxB_init (mode, mymalloc, NULL, NULL, myfree) - - GxB_init (mode, mxMalloc, NULL, NULL, mxFree) - GxB_init (mode, pymalloc, pycalloc, pyrealloc, pyfree) - GxB_init (mode, jl_malloc, jl_calloc, jl_realloc, jl_free) - GxB_init (mode, RedisModule_malloc, RedisModule_calloc, RedisModule_realloc, RedisModule_realloc) - - GxB_init (mode, rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free) -*/ - -//------------------------------------------------------------------------------ -// rmm_wrap_malloc +// rmm_wrap_malloc: malloc-equivalent method using RMM //------------------------------------------------------------------------------ +// rmm_wrap_malloc is identical to the ANSI C11 malloc function, except that +// it uses RMM underneath to allocate the space. + void *rmm_wrap_malloc (std::size_t size) { return (rmm_wrap_allocate (&size)) ; } //------------------------------------------------------------------------------ -// rmm_wrap_calloc +// rmm_wrap_calloc: calloc-equivalent method using RMM //------------------------------------------------------------------------------ +// rmm_wrap_calloc is identical to the ANSI C11 calloc function, except that +// it uses RMM underneath to allocate the space. + void *rmm_wrap_calloc (std::size_t n, std::size_t size) { std::size_t s = n * size ; void *p = rmm_wrap_allocate (&s) ; - // NOTE: single-threaded on the CPU. 
If you want - a faster method, malloc the space and use cudaMemset - for the GPU or GB_memset on the CPU. + // NOTE: this is single-threaded on the CPU. If you want a faster method, + // malloc the space and use cudaMemset for the GPU or GB_memset on the CPU. + // The GraphBLAS GB_calloc_memory method uses malloc and GB_memset. memset (p, 0, s) ; return (p) ; } //------------------------------------------------------------------------------ -// rmm_wrap_realloc +// rmm_wrap_realloc: realloc-equivalent method using RMM //------------------------------------------------------------------------------ +// rmm_wrap_realloc is identical to the ANSI C11 realloc function, except that +// it uses RMM underneath to allocate the space. + void *rmm_wrap_realloc (void *p, std::size_t newsize) { if (p == NULL) @@ -231,7 +289,7 @@ void *rmm_wrap_realloc (void *p, std::size_t newsize) // check for quick return if (newsize >= oldsize/2 && newsize <= oldsize) - { + { // Be lazy. If the block does not change, or is shrinking but only by a // small amount, then leave the block as-is. return (p) ; @@ -259,22 +317,34 @@ } //------------------------------------------------------------------------------ -// rmm_wrap_free +// rmm_wrap_free: free a block of memory, size not needed //------------------------------------------------------------------------------ +// rmm_wrap_free is identical to the ANSI C11 free function, except that +// it uses RMM underneath to free the space. + void rmm_wrap_free (void *p) { rmm_wrap_deallocate (p, 0) ; } //------------------------------------------------------------------------------ -// rmm_wrap_allocate +// rmm_wrap_allocate: allocate a block of memory using RMM //------------------------------------------------------------------------------ void *rmm_wrap_allocate( std::size_t *size) { if (rmm_wrap_context == NULL) return (NULL) ; + alloc_map *am = rmm_wrap_context->size_map.get() ; + if (am == NULL) + { + // PANIC!
+ // std::cout<< "Uh oh, can't allocate before initializing RMM" + // << std::endl; + return (NULL) ; + } + // ensure size is nonzero if (*size == 0) *size = 256 ; // round-up the allocation to a multiple of 256 @@ -283,41 +353,37 @@ void *rmm_wrap_allocate( std::size_t *size) { *size += (256 - aligned) ; } - printf(" rmm_wrap_alloc %ld bytes\n",*size); - rmm::mr::device_memory_resource *memoryresource = rmm::mr::get_current_device_resource(); - void *p = memoryresource->allocate( *size ); + +// printf(" rmm_wrap_alloc %ld bytes\n",*size) ; + + rmm::mr::device_memory_resource *memoryresource = + rmm::mr::get_current_device_resource() ; + void *p = memoryresource->allocate( *size ) ; if (p == NULL) { + // out of memory *size = 0 ; return (NULL) ; } // insert p into the hashmap - alloc_map *am = rmm_wrap_context->size_map.get() ; - if (am == NULL) - { - std::cout<< "Uh oh, can't allocate before initializing RMM"<< std::endl; - } - else - { - am->emplace ( (std::size_t)p, (std::size_t)(*size) ) ; - } - return p ; -} + am->emplace ((std::size_t)p, (std::size_t)(*size)) ; + // return the allocated block + return (p) ; +} //------------------------------------------------------------------------------ -// rmm_wrap_allocate +// rmm_wrap_deallocate: deallocate a block previously allocated by RMM //------------------------------------------------------------------------------ void rmm_wrap_deallocate( void *p, std::size_t size) { if (rmm_wrap_context == NULL) return ; - //printf("dealloc %ld bytes\n", size); - // Note: there are 3 PANIC cases below. The API of rmm_wrap_deallocate does not - // allow an error condition to be returned. These PANICs could be logged, - // or they could terminate the program if debug mode enabled, etc. + // Note: there are 3 PANIC cases below. The API of rmm_wrap_deallocate + // does not allow an error condition to be returned. These PANICs could be + // logged, or they could terminate the program if debug mode enabled, etc. // In production, all we can do is ignore the PANIC. if (p == NULL) @@ -330,15 +396,17 @@ void rmm_wrap_deallocate( void *p, std::size_t size) return ; } - // check the size given. If the input size is zero, then the // size is unknown (say rmm_wrap_free(p)). In that case, just trust the // hashmap. Otherwise, double-check to make sure the size is correct. alloc_map *am = rmm_wrap_context->size_map.get() ; - size_t actual_size = 0; - if ( am == NULL) + size_t actual_size = 0 ; + if (am == NULL) { - std::cout<< "Uh oh, can't deallocate before initializing RMM"<< std::endl; + // PANIC! 
+ // std::cout<< "Uh oh, can't deallocate before initializing RMM" + // << std::endl; + return ; } else { @@ -363,7 +431,8 @@ am->erase ( (std::size_t)(p) ) ; // deallocate the block of memory - rmm::mr::device_memory_resource *memoryresource = rmm::mr::get_current_device_resource(); - memoryresource->deallocate( p, actual_size ); + rmm::mr::device_memory_resource *memoryresource = + rmm::mr::get_current_device_resource() ; + memoryresource->deallocate( p, actual_size ) ; } diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.h b/GraphBLAS/rmm_wrap/rmm_wrap.h index 678281c30..c2461719b 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.h +++ b/GraphBLAS/rmm_wrap/rmm_wrap.h @@ -1,4 +1,33 @@ //------------------------------------------------------------------------------ +// rmm_wrap/rmm_wrap.h: include file for rmm_wrap +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// example usage in GraphBLAS: + +/* + GrB_init (mode) ; // ANSI C11 malloc/calloc/realloc/free, no PMR + GxB_init (mode, mymalloc, mycalloc, myrealloc, myfree) ; + + GxB_init (mode, mymalloc, NULL, NULL, myfree) ; + + GxB_init (mode, mxMalloc, NULL, NULL, mxFree) ; + GxB_init (mode, pymalloc, pycalloc, pyrealloc, pyfree) ; + GxB_init (mode, jl_malloc, jl_calloc, jl_realloc, jl_free) ; + GxB_init (mode, RedisModule_malloc, RedisModule_calloc, + RedisModule_realloc, RedisModule_free) ; + + // using the RMM functions: + rmm_wrap_initialize (rmm_wrap_managed, 256 * 1000000L, 256 * 1000000000L) ; + GxB_init (GxB_NONBLOCKING_GPU, rmm_wrap_malloc, rmm_wrap_calloc, + rmm_wrap_realloc, rmm_wrap_free) ; + // ... use GraphBLAS on the GPU + rmm_wrap_finalize ( ) ; +*/ + //------------------------------------------------------------------------------ #ifndef RMM_WRAP_H @@ -12,14 +41,30 @@ extern "C" { #endif // TODO describe the modes -typedef enum { rmm_wrap_host=0, rmm_wrap_host_pinned=1, rmm_wrap_device=2, rmm_wrap_managed=3 } RMM_MODE ; +typedef enum +{ + rmm_wrap_host = 0, + rmm_wrap_host_pinned = 1, + rmm_wrap_device = 2, + rmm_wrap_managed = 3 +} +RMM_MODE ; + +// create an RMM resource +int rmm_wrap_initialize +( + RMM_MODE mode, + size_t init_pool_size, + size_t max_pool_size +) ; +// destroy an RMM resource void rmm_wrap_finalize (void) ; -int rmm_wrap_initialize (RMM_MODE mode, size_t init_pool_size, size_t max_pool_size) ; // example usage: // rmm_wrap_initialize (rmm_wrap_managed, INT32_MAX, INT64_MAX) ; - // GxB_init (GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, rmm_wrap_realloc, rmm_wrap_free) ; + // GxB_init (GrB_NONBLOCKING, rmm_wrap_malloc, rmm_wrap_calloc, + // rmm_wrap_realloc, rmm_wrap_free) ; // use GraphBLAS ...
// GrB_finalize ( ) ; // rmm_wrap_finalize ( ) ; diff --git a/GraphBLAS/rmm_wrap/rmm_wrap.hpp b/GraphBLAS/rmm_wrap/rmm_wrap.hpp index ab1d592c5..5f8b7d46f 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap.hpp +++ b/GraphBLAS/rmm_wrap/rmm_wrap.hpp @@ -1,3 +1,11 @@ +//------------------------------------------------------------------------------ +// rmm_wrap/rmm_wrap.hpp +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + #pragma once #include "stddef.h" @@ -32,5 +40,5 @@ typedef rmm::mr::pool_memory_resource<rmm::mr::pinned_memory_resource> host_pinned_pool_mr; typedef rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> device_pool_mr; typedef rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource> managed_pool_mr; - typedef std::unordered_map< std::size_t, std::size_t> alloc_map; + diff --git a/GraphBLAS/rmm_wrap/rmm_wrap_test.c b/GraphBLAS/rmm_wrap/rmm_wrap_test.c index 6fc17f44b..3e4fefa66 100644 --- a/GraphBLAS/rmm_wrap/rmm_wrap_test.c +++ b/GraphBLAS/rmm_wrap/rmm_wrap_test.c @@ -1,10 +1,17 @@ +//------------------------------------------------------------------------------ +// rmm_wrap/rmm_wrap_test.c: simple main program for testing rmm_wrap +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ #include "rmm_wrap.h" int main() { - size_t init_size, max_size; + size_t init_size, max_size; init_size = 256*(1ULL<<10); max_size = 256*(1ULL<<20); @@ -23,7 +30,6 @@ int main() rmm_wrap_deallocate( p, buff_size); rmm_wrap_finalize(); - rmm_wrap_initialize( rmm_wrap_device, init_size, max_size ); printf("RMM initialized! in device mode\n"); @@ -35,6 +41,5 @@ int main() fflush(stdout); rmm_wrap_deallocate( p, buff_size); rmm_wrap_finalize(); - - } +
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/GraphBLAS/zstd/README.txt b/GraphBLAS/zstd/README.txt new file mode 100644 index 000000000..5822def6b --- /dev/null +++ b/GraphBLAS/zstd/README.txt @@ -0,0 +1,28 @@ +ZSTD Library, Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Notes by Tim Davis, on inclusion of ZSTD into SuiteSparse:GraphBLAS: + +This directory contains a minimal copy of zstd v1.5.3, from +https://github.com/facebook/zstd.git, by Yann Collet. +See ./LICENSE and ./README_zstd.md for more details. + +Files in this folder: + + LICENSE BSD 3-clause, Copyright (c) 2016-present, Facebook, Inc. + FIXME ... list all the files/folders here + README_zstd.md zstd/README.md + README.txt this file + +When ZSTD is compiled for use in SuiteSparse:GraphBLAS, ZSTD_DEPS_MALLOC is +defined, and ZSTD_malloc, ZSTD_calloc, and ZSTD_free are provided to ZSTD +instead of having it use the standard ANSI C11 malloc/calloc/free. Those +functions use whatever memory manager is given to GxB_init, or the ANSI C11 +malloc/calloc/free for GrB_init. + +This compile-time change could cause a conflict if ZSTD is also installed as a +system-wide library. To avoid the conflict, all ZSTD function names are renamed +to GB_ZSTD_*, using #defines in ../Source/GB_zstd.c. SuiteSparse:GraphBLAS will +use this version of ZSTD, integrated into the libgraphblas.so (.dylib, .dll), +rather than a separate libzstd.so library. + diff --git a/GraphBLAS/zstd/README_zstd.md b/GraphBLAS/zstd/README_zstd.md new file mode 100644 index 000000000..015cde567 --- /dev/null +++ b/GraphBLAS/zstd/README_zstd.md @@ -0,0 +1,217 @@ +Zstandard library files +================================ + +The __lib__ directory is split into several sub-directories, +in order to make it easier to select or exclude features. + + +#### Building + +`Makefile` script is provided, supporting [Makefile conventions](https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html#Makefile-Conventions), +including command variables, staged install, directory variables and standard targets. +- `make` : generates both static and dynamic libraries +- `make install` : install libraries and headers in target system directories + +`libzstd` default scope is pretty large, including compression, decompression, dictionary builder, +and support for decoding legacy formats >= v0.5.0. +The scope can be reduced on demand (see paragraph _modular build_). + + +#### Multithreading support + +When building with `make`, by default the dynamic library is multithreaded and static library is single-threaded (for compatibility reasons).
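+For orientation, here is a minimal round-trip sketch of the stable one-shot
+API described below. The names are upstream zstd names (when compiled into
+SuiteSparse:GraphBLAS they are renamed with the GB_ZSTD_ prefix), and the
+buffer sizes are illustrative only:
+```c
+#include <stdio.h>
+#include <string.h>
+#include "zstd.h"
+
+int main (void)
+{
+    const char src [ ] = "zstd round-trip, zstd round-trip, zstd round-trip" ;
+    char dst [256], back [sizeof (src)] ;
+    /* 256 bytes is enough here: ZSTD_compressBound (sizeof (src)) < 256 */
+    size_t csize = ZSTD_compress (dst, sizeof (dst), src, sizeof (src), 1) ;
+    if (ZSTD_isError (csize)) return (1) ;
+    size_t dsize = ZSTD_decompress (back, sizeof (back), dst, csize) ;
+    if (ZSTD_isError (dsize) || dsize != sizeof (src)) return (1) ;
+    if (memcmp (src, back, sizeof (src)) != 0) return (1) ;
+    printf ("round-trip OK: %zu bytes -> %zu compressed\n",
+        sizeof (src), csize) ;
+    return (0) ;
+}
+```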
+ +Enabling multithreading requires 2 conditions : +- set build macro `ZSTD_MULTITHREAD` (`-DZSTD_MULTITHREAD` for `gcc`) +- for POSIX systems : compile with pthread (`-pthread` compilation flag for `gcc`) + +For convenience, we provide a build target to generate multi and single threaded libraries: +- Force enable multithreading on both dynamic and static libraries by appending `-mt` to the target, e.g. `make lib-mt`. +- Force disable multithreading on both dynamic and static libraries by appending `-nomt` to the target, e.g. `make lib-nomt`. +- By default, as mentioned before, dynamic library is multithreaded, and static library is single-threaded, e.g. `make lib`. + +When linking a POSIX program with a multithreaded version of `libzstd`, +note that it's necessary to invoke the `-pthread` flag during link stage. + +Multithreading capabilities are exposed +via the [advanced API defined in `lib/zstd.h`](https://github.com/facebook/zstd/blob/v1.4.3/lib/zstd.h#L351). + + +#### API + +Zstandard's stable API is exposed within [lib/zstd.h](zstd.h). + + +#### Advanced API + +Optional advanced features are exposed via : + +- `lib/zstd_errors.h` : translates `size_t` function results + into a `ZSTD_ErrorCode`, for accurate error handling. + +- `ZSTD_STATIC_LINKING_ONLY` : if this macro is defined _before_ including `zstd.h`, + it unlocks access to the experimental API, + exposed in the second part of `zstd.h`. + All definitions in the experimental APIs are unstable, + they may still change in the future, or even be removed. + As a consequence, experimental definitions shall ___never be used with dynamic library___ ! + Only static linking is allowed. + + +#### Modular build + +It's possible to compile only a limited set of features within `libzstd`. +The file structure is designed to make this selection manually achievable for any build system : + +- Directory `lib/common` is always required, for all variants. + +- Compression source code lies in `lib/compress` + +- Decompression source code lies in `lib/decompress` + +- It's possible to include only `compress` or only `decompress`, they don't depend on each other. + +- `lib/dictBuilder` : makes it possible to generate dictionaries from a set of samples. + The API is exposed in `lib/dictBuilder/zdict.h`. + This module depends on both `lib/common` and `lib/compress` . + +- `lib/legacy` : makes it possible to decompress legacy zstd formats, starting from `v0.1.0`. + This module depends on `lib/common` and `lib/decompress`. + To enable this feature, define `ZSTD_LEGACY_SUPPORT` during compilation. + Specifying a number limits versions supported to that version onward. + For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats >= v0.2.0". + Conversely, `ZSTD_LEGACY_SUPPORT=0` means "do __not__ support legacy formats". + By default, this build macro is set as `ZSTD_LEGACY_SUPPORT=5`. + Decoding supported legacy format is a transparent capability triggered within decompression functions. + It's also allowed to invoke legacy API directly, exposed in `lib/legacy/zstd_legacy.h`. + Each version does also provide its own set of advanced API. + For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` . + +- While invoking `make libzstd`, it's possible to define build macros + `ZSTD_LIB_COMPRESSION, ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`, + and `ZSTD_LIB_DEPRECATED` as `0` to forgo compilation of the + corresponding features. This will also disable compilation of all + dependencies (e.g. 
`ZSTD_LIB_COMPRESSION=0` will also disable + dictBuilder). + +- There are a number of options that can help minimize the binary size of + `libzstd`. + + The first step is to select the components needed (using the above-described + `ZSTD_LIB_COMPRESSION` etc.). + + The next step is to set `ZSTD_LIB_MINIFY` to `1` when invoking `make`. This + disables various optional components and changes the compilation flags to + prioritize space-saving. + + Detailed options: Zstandard's code and build environment is set up by default + to optimize above all else for performance. In pursuit of this goal, Zstandard + makes significant trade-offs in code size. For example, Zstandard often has + more than one implementation of a particular component, with each + implementation optimized for different scenarios. For example, the Huffman + decoder has complementary implementations that decode the stream one symbol at + a time or two symbols at a time. Zstd normally includes both (and dispatches + between them at runtime), but by defining `HUF_FORCE_DECOMPRESS_X1` or + `HUF_FORCE_DECOMPRESS_X2`, you can force the use of one or the other, avoiding + compilation of the other. Similarly, `ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT` + and `ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG` force the compilation and use of + only one or the other of two decompression implementations. The smallest + binary is achieved by using `HUF_FORCE_DECOMPRESS_X1` and + `ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT` (implied by `ZSTD_LIB_MINIFY`). + + For squeezing the last ounce of size out, you can also define + `ZSTD_NO_INLINE`, which disables inlining, and `ZSTD_STRIP_ERROR_STRINGS`, + which removes the error messages that are otherwise returned by + `ZSTD_getErrorName` (implied by `ZSTD_LIB_MINIFY`). + + Finally, when integrating into your application, make sure you're doing link- + time optimization and unused symbol garbage collection (via some combination of, + e.g., `-flto`, `-ffat-lto-objects`, `-fuse-linker-plugin`, + `-ffunction-sections`, `-fdata-sections`, `-fmerge-all-constants`, + `-Wl,--gc-sections`, `-Wl,-z,norelro`, and an archiver that understands + the compiler's intermediate representation, e.g., `AR=gcc-ar`). Consult your + compiler's documentation. + +- While invoking `make libzstd`, the build macro `ZSTD_LEGACY_MULTITHREADED_API=1` + will expose the deprecated `ZSTDMT` API exposed by `zstdmt_compress.h` in + the shared library, which is now hidden by default. + +- The build macro `DYNAMIC_BMI2` can be set to 1 or 0 in order to generate binaries + which can detect at runtime the presence of BMI2 instructions, and use them only if present. + These instructions contribute to better performance, notably on the decoder side. + By default, this feature is automatically enabled on detecting + the right instruction set (x64) and compiler (clang or gcc >= 5). + It's obviously disabled for different cpus, + or when BMI2 instruction set is _required_ by the compiler command line + (in this case, only the BMI2 code path is generated). + Setting this macro will either force to generate the BMI2 dispatcher (1) + or prevent it (0). It overrides automatic detection. + +- The build macro `ZSTD_NO_UNUSED_FUNCTIONS` can be defined to hide the definitions of functions + that zstd does not use. Not all unused functions are hidden, but they can be if needed. + Currently, this macro will hide function definitions in FSE and HUF that use an excessive + amount of stack space. 
+ +- The build macro `ZSTD_NO_INTRINSICS` can be defined to disable all explicit intrinsics. + Compiler builtins are still used. + +- The build macro `ZSTD_DECODER_INTERNAL_BUFFER` can be set to control + the amount of extra memory used during decompression to store literals. + This defaults to 64kB. Reducing this value reduces the memory footprint of + `ZSTD_DCtx` decompression contexts, + but might also result in a small decompression speed cost. + + +#### Windows : using MinGW+MSYS to create DLL + +DLL can be created using MinGW+MSYS with the `make libzstd` command. +This command creates `dll\libzstd.dll` and the import library `dll\libzstd.lib`. +The import library is only required with Visual C++. +The header file `zstd.h` and the dynamic library `dll\libzstd.dll` are required to +compile a project using gcc/MinGW. +The dynamic library has to be added to linking options. +It means that if a project that uses ZSTD consists of a single `test-dll.c` +file it should be linked with `dll\libzstd.dll`. For example: +``` + gcc $(CFLAGS) -Iinclude/ test-dll.c -o test-dll dll\libzstd.dll +``` +The compiled executable will require ZSTD DLL which is available at `dll\libzstd.dll`. + + +#### Advanced Build options + +The build system requires a hash function in order to +separate object files created with different compilation flags. +By default, it tries to use `md5sum` or equivalent. +The hash function can be manually switched by setting the `HASH` variable. +For example : `make HASH=xxhsum` +The hash function needs to generate at least 64-bit using hexadecimal format. +When no hash function is found, +the Makefile just generates all object files into the same default directory, +irrespective of compilation flags. +This functionality only matters if `libzstd` is compiled multiple times +with different build flags. + +The build directory, where object files are stored +can also be manually controlled using variable `BUILD_DIR`, +for example `make BUILD_DIR=objectDir/v1`. +In which case, the hash function doesn't matter. + + +#### Deprecated API + +Obsolete API on their way out are stored in directory `lib/deprecated`. +At this stage, it contains older streaming prototypes, in `lib/deprecated/zbuff.h`. +These prototypes will be removed in some future version. +Consider migrating code towards supported streaming API exposed in `zstd.h`. + + +#### Miscellaneous + +The other files are not source code. There are : + + - `BUCK` : support for `buck` build system (https://buckbuild.com/) + - `Makefile` : `make` script to build and install zstd library (static and dynamic) + - `README.md` : this file + - `dll/` : resources directory for Windows compilation + - `libzstd.pc.in` : script for `pkg-config` (used in `make install`) diff --git a/GraphBLAS/zstd/zstd_subset/common/bits.h b/GraphBLAS/zstd/zstd_subset/common/bits.h new file mode 100644 index 000000000..c0e917750 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/bits.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_BITS_H +#define ZSTD_BITS_H + +#include "mem.h" + +MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) +{ + assert(val != 0); + { + static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9}; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) +{ + assert(val != 0); +# if defined(_MSC_VER) +# if STATIC_BMI2 == 1 + return _tzcnt_u32(val); +# else + if (val != 0) { + unsigned long r; + _BitScanForward(&r, val); + return (unsigned)r; + } else { + /* Should not reach this code path */ + __assume(0); + } +# endif +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (unsigned)__builtin_ctz(val); +# else + return ZSTD_countTrailingZeros32_fallback(val); +# endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { + assert(val != 0); + { + static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31}; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) +{ + assert(val != 0); +# if defined(_MSC_VER) +# if STATIC_BMI2 == 1 + return _lzcnt_u32(val); +# else + if (val != 0) { + unsigned long r; + _BitScanReverse(&r, val); + return (unsigned)(31 - r); + } else { + /* Should not reach this code path */ + __assume(0); + } +# endif +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (unsigned)__builtin_clz(val); +# else + return ZSTD_countLeadingZeros32_fallback(val); +# endif +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) +{ + assert(val != 0); +# if defined(_MSC_VER) && defined(_WIN64) +# if STATIC_BMI2 == 1 + return _tzcnt_u64(val); +# else + if (val != 0) { + unsigned long r; + _BitScanForward64(&r, val); + return (unsigned)r; + } else { + /* Should not reach this code path */ + __assume(0); + } +# endif +# elif defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__) + return (unsigned)__builtin_ctzll(val); +# else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); + } else { + return ZSTD_countTrailingZeros32(leastSignificantWord); + } + } +# endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) +{ + assert(val != 0); +# if defined(_MSC_VER) && defined(_WIN64) +# if STATIC_BMI2 == 1 + return _lzcnt_u64(val); +# else + if (val != 0) { + unsigned long r; + _BitScanReverse64(&r, val); + return (unsigned)(63 - r); + } else { + /* Should not reach this code path */ + __assume(0); + } +# endif +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (unsigned)(__builtin_clzll(val)); +# else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (mostSignificantWord == 0) { + return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); + } else { + return ZSTD_countLeadingZeros32(mostSignificantWord); + } + } +# endif +} + +MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { + return ZSTD_countTrailingZeros64((U64)val) >> 3; + } else { + return ZSTD_countTrailingZeros32((U32)val) >> 3; + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { + return 
ZSTD_countLeadingZeros64((U64)val) >> 3; + } else { + return ZSTD_countLeadingZeros32((U32)val) >> 3; + } + } +} + +MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +{ + assert(val != 0); + return 31 - ZSTD_countLeadingZeros32(val); +} + +#endif /* ZSTD_BITS_H */ diff --git a/GraphBLAS/zstd/zstd_subset/common/bitstream.h b/GraphBLAS/zstd/zstd_subset/common/bitstream.h new file mode 100644 index 000000000..841778626 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/bitstream.h @@ -0,0 +1,437 @@ +/* ****************************************************************** + * bitstream + * Part of FSE library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ +#ifndef BITSTREAM_H_MODULE +#define BITSTREAM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif +/* +* This API consists of small unitary functions, which must be inlined for best performance. +* Since link-time-optimization is not available for all compilers, +* these functions are defined into a .h to be included. +*/ + +/*-**************************************** +* Dependencies +******************************************/ +#include "mem.h" /* unaligned access routines */ +#include "compiler.h" /* UNLIKELY() */ +#include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ +#include "error_private.h" /* error codes and messages */ +#include "bits.h" /* ZSTD_highbit32 */ + + +/*========================================= +* Target specific +=========================================*/ +#ifndef ZSTD_NO_INTRINSICS +# if (defined(__BMI__) || defined(__BMI2__)) && defined(__GNUC__) +# include <immintrin.h> /* support for bextr (experimental)/bzhi */ +# elif defined(__ICCARM__) +# include <intrinsics.h> +# endif +#endif + +#define STREAM_ACCUMULATOR_MIN_32 25 +#define STREAM_ACCUMULATOR_MIN_64 57 +#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64)) + + +/*-****************************************** +* bitStream encoding API (write forward) +********************************************/ +/* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ +typedef struct { + size_t bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; + char* endPtr; +} BIT_CStream_t; + +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); +MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +/* Start with initCStream, providing the size of buffer to write into. +* bitStream will never write outside of this buffer. +* `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. +* +* bits are first added to a local register. +* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
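+* (A sketch of the whole sequence, where dst, dstCapacity, value and nbBits
+* are placeholders:
+*     BIT_CStream_t bitC;
+*     size_t r = BIT_initCStream(&bitC, dst, dstCapacity);  .. 0, or an error code
+*     BIT_addBits(&bitC, value, nbBits);
+*     BIT_flushBits(&bitC);
+*     size_t n = BIT_closeCStream(&bitC);  .. final size in bytes, 0 if it did not fit
+* )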
+* Writing data into memory is an explicit operation, performed by the flushBits function. +* Hence keep track how many bits are potentially stored into local register to avoid register overflow. +* After a flushBits, a maximum of 7 bits might still be stored into local register. +* +* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers. +* +* Last operation is to close the bitStream. +* The function returns the final size of CStream in bytes. +* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable) +*/ + + +/*-******************************************** +* bitStream decoding API (read backward) +**********************************************/ +typedef struct { + size_t bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; +} BIT_DStream_t; + +typedef enum { BIT_DStream_unfinished = 0, + BIT_DStream_endOfBuffer = 1, + BIT_DStream_completed = 2, + BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ + /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ + +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + +/* Start by invoking BIT_initDStream(). +* A chunk of the bitStream is then stored into a local register. +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* You can then retrieve bitFields stored into the local register, **in reverse order**. +* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. +* A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +* Otherwise, it can be less than that, so proceed accordingly. +* Checking if DStream has reached its end can be performed with BIT_endOfDStream(). +*/ + + +/*-**************************************** +* unsafe API +******************************************/ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +/* unsafe version; does not check buffer overflow */ + +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); +/* faster, but works only if nbBits >= 1 */ + +/*===== Local Constants =====*/ +static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, + 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, + 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, + 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, + 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, + 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */ +#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0])) + +/*-************************************************************** +* bitStream encoding +****************************************************************/ +/*! 
BIT_initCStream() : + * `dstCapacity` must be > sizeof(size_t) + * @return : 0 if success, + * otherwise an error code (can be tested using ERR_isError()) */ +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + bitC->bitContainer = 0; + bitC->bitPos = 0; + bitC->startPtr = (char*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer); + if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall); + return 0; +} + +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +{ +#if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS) + return _bzhi_u64(bitContainer, nbBits); +#else + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; +#endif +} + +/*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_addBitsFast() : + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= value << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_flushBitsFast() : + * assumption : bitContainer has not overflowed + * unsafe version; does not check buffer overflow */ +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_flushBits() : + * assumption : bitContainer has not overflowed + * safe version; check for buffer overflow, and prevents it. + * note : does not signal buffer overflow. + * overflow will be revealed later on using BIT_closeCStream() */ +MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_closeCStream() : + * @return : size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) +{ + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); +} + + +/*-******************************************************** +* bitStream decoding +**********************************************************/ +/*! BIT_initDStream() : + * Initialize a BIT_DStream_t. + * `bitD` : a pointer to an already allocated BIT_DStream_t structure. + * `srcSize` must be the *exact* size of the bitStream, in bytes. 
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected + */ +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) +{ + if (srcSize < 1) { ZSTD_memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + + bitD->start = (const char*)srcBuffer; + bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer); + + if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { + case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + + case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + + case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + + case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + + case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + + case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; + } + + return srcSize; +} + +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) +{ + return bitContainer >> start; +} + +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) +{ + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ + assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better + * than accessing memory. When bmi2 instruction is not present, we consider + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +#if defined(__x86_64__) || defined(_M_X86) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); +#else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +#endif +} + +/*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
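+ *  (one bit below the 25- or 57-bit refill guarantee described for
+ *  BIT_reloadDStream below)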
+ * @return : value extracted */ +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) +{ + /* arbitrate between double-shift and shift+mask */ +#if 1 + /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8, + * bitstream is likely corrupted, and result is undefined */ + return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits); +#else + /* this code path is slower on my os-x laptop */ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask); +#endif +} + +/*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) +{ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); +} + +MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +{ + bitD->bitsConsumed += nbBits; +} + +/*! BIT_readBits() : + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_readBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! + * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this + * point you must use BIT_reloadDStream() to reload. + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) +{ + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; + assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; +} + +/*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . + * This function is safe, it guarantees it will not read beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
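+ *  (a sketch of the usual consume/reload loop, nbBits being a placeholder:
+ *      BIT_DStream_t bitD;
+ *      BIT_initDStream(&bitD, src, srcSize);   .. check with ERR_isError
+ *      while (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) {
+ *          size_t field = BIT_readBits(&bitD, nbBits);
+ *      }
+ *   and BIT_endOfDStream(&bitD) then tells whether all bits were consumed);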
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +{ + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; + + if (bitD->ptr >= bitD->limitPtr) { + return BIT_reloadDStreamFast(bitD); + } + if (bitD->ptr == bitD->start) { + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } + /* start < ptr < limitPtr */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ + result = BIT_DStream_endOfBuffer; + } + bitD->ptr -= nbBytes; + bitD->bitsConsumed -= nbBytes*8; + bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */ + return result; + } +} + +/*! BIT_endOfDStream() : + * @return : 1 if DStream has _exactly_ reached its end (all bits consumed). + */ +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) +{ + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* BITSTREAM_H_MODULE */ diff --git a/GraphBLAS/zstd/zstd_subset/common/compiler.h b/GraphBLAS/zstd/zstd_subset/common/compiler.h new file mode 100644 index 000000000..6c7100e83 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/compiler.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPILER_H +#define ZSTD_COMPILER_H + +#include "portability_macros.h" + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +/* force inlining */ + +#if !defined(ZSTD_NO_INLINE) +#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif + +/** + On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC). + This explicitly marks such functions as __cdecl so that the code will still compile + if a CC other than __cdecl has been made the default. +*/ +#if defined(_MSC_VER) +# define WIN_CDECL __cdecl +#else +# define WIN_CDECL +#endif + +/** + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +/** + * HINT_INLINE is used to help the compiler generate better code. 
It is *not* + * used for "templates", so it can be tweaked based on the compilers + * performance. + * + * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the + * always_inline attribute. + * + * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline + * attribute. + */ +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 +# define HINT_INLINE static INLINE_KEYWORD +#else +# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +#endif + +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +#if defined(__GNUC__) +# define UNUSED_ATTR __attribute__((unused)) +#else +# define UNUSED_ATTR +#endif + +/* force no inlining */ +#ifdef _MSC_VER +# define FORCE_NOINLINE static __declspec(noinline) +#else +# if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_NOINLINE static __attribute__((__noinline__)) +# else +# define FORCE_NOINLINE static +# endif +#endif + + +/* target attribute */ +#if defined(__GNUC__) || defined(__ICCARM__) +# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) +#else +# define TARGET_ATTRIBUTE(target) +#endif + +/* Target attribute for BMI2 dynamic dispatch. + * Enable lzcnt, bmi, and bmi2. + * We test for bmi1 & bmi2. lzcnt is included in bmi1. + */ +#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") + +/* prefetch + * can be disabled, by declaring NO_PREFETCH build macro */ +#if defined(NO_PREFETCH) +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# elif defined(__aarch64__) +# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) +# else +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* NO_PREFETCH */ + +#define CACHELINE_SIZE 64 + +#define PREFETCH_AREA(p, s) { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ +} + +/* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, + * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__) +# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5) +# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) +# else +# define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")") +# endif +#else +# define DONT_VECTORIZE +#endif + +/* Tell the compiler that a branch is likely or unlikely. + * Only use these macros if it causes the compiler to generate better code.
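+ * (illustrative: if (UNLIKELY(ptr == NULL)) return ERROR(GENERIC); )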
+ If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc + * and clang, please do. + */ +#if defined(__GNUC__) +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +/* disable warnings */ +#ifdef _MSC_VER /* Visual Studio */ +# include <intrin.h> /* For Visual 2005 */ +# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */ +# pragma warning(disable : 4324) /* disable: C4324: padded structure */ +#endif + +/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +#ifndef STATIC_BMI2 +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) +# ifdef __AVX2__ //MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2 +# define STATIC_BMI2 1 +# endif +# elif defined(__BMI2__) && defined(__x86_64__) && defined(__GNUC__) +# define STATIC_BMI2 1 +# endif +#endif + +#ifndef STATIC_BMI2 + #define STATIC_BMI2 0 +#endif + +/* compile time determination of SIMD support */ +#if !defined(ZSTD_NO_INTRINSICS) +# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) +# define ZSTD_ARCH_X86_SSE2 +# endif +# if defined(__ARM_NEON) || defined(_M_ARM64) +# define ZSTD_ARCH_ARM_NEON +# endif +# +# if defined(ZSTD_ARCH_X86_SSE2) +# include <emmintrin.h> +# elif defined(ZSTD_ARCH_ARM_NEON) +# include <arm_neon.h> +# endif +#endif + +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) +# define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define ZSTD_HAS_C_ATTRIBUTE(x) 0 +#endif + +/* Only use C++ attributes in C++. Some compilers report support for C++ + * attributes when compiling with C. + */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define ZSTD_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define ZSTD_HAS_CPP_ATTRIBUTE(x) 0 +#endif + +/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute. + * - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough + * - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * - Else: __attribute__((__fallthrough__)) + */ +#ifndef ZSTD_FALLTHROUGH +# if ZSTD_HAS_C_ATTRIBUTE(fallthrough) +# define ZSTD_FALLTHROUGH [[fallthrough]] +# elif ZSTD_HAS_CPP_ATTRIBUTE(fallthrough) +# define ZSTD_FALLTHROUGH [[fallthrough]] +# elif __has_attribute(__fallthrough__) +/* Leading semicolon is to satisfy gcc-11 with -pedantic. Without the semicolon + * gcc complains about: a label can only be part of a statement and a declaration is not a statement. + */ +# define ZSTD_FALLTHROUGH ; __attribute__((__fallthrough__)) +# else +# define ZSTD_FALLTHROUGH +# endif +#endif + +/*-************************************************************** +* Alignment check +*****************************************************************/ + +/* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, + * which remains valid for both user & kernel spaces.
+ */ + +#ifndef ZSTD_ALIGNOF +# if defined(__GNUC__) || defined(_MSC_VER) +/* covers gcc, clang & MSVC */ +/* note : this section must come first, before C11, + * due to a limitation in the kernel source generator */ +# define ZSTD_ALIGNOF(T) __alignof(T) + +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +/* C11 support */ +# include <stdalign.h> +# define ZSTD_ALIGNOF(T) alignof(T) + +# else +/* No known support for alignof() - imperfect backup */ +# define ZSTD_ALIGNOF(T) (sizeof(void*) < sizeof(T) ? sizeof(void*) : sizeof(T)) + +# endif +#endif /* ZSTD_ALIGNOF */ + +/*-************************************************************** +* Sanitizer +*****************************************************************/ + +#if ZSTD_MEMORY_SANITIZER +/* Not all platforms that support msan provide sanitizers/msan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ +#include <stddef.h> /* size_t */ +#define ZSTD_DEPS_NEED_STDINT +#include "zstd_deps.h" /* intptr_t */ + +/* Make memory region fully initialized (without changing its contents). */ +void __msan_unpoison(const volatile void *a, size_t size); + +/* Make memory region fully uninitialized (without changing its contents). + This is a legacy interface that does not update origin information. Use + __msan_allocated_memory() instead. */ +void __msan_poison(const volatile void *a, size_t size); + +/* Returns the offset of the first (at least partially) poisoned byte in the + memory range, or -1 if the whole range is good. */ +intptr_t __msan_test_shadow(const volatile void *x, size_t size); +#endif + +#if ZSTD_ADDRESS_SANITIZER +/* Not all platforms that support asan provide sanitizers/asan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ +#include <stddef.h> /* size_t */ + +/** + * Marks a memory region ([addr, addr+size)) as unaddressable. + * + * This memory must be previously allocated by your program. Instrumented + * code is forbidden from accessing addresses in this region until it is + * unpoisoned. This function is not guaranteed to poison the entire region - + * it could poison only a subregion of [addr, addr+size) due to ASan + * alignment restrictions. + * + * \note This function is not thread-safe because no two threads can poison or + * unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_poison_memory_region(void const volatile *addr, size_t size); + +/** + * Marks a memory region ([addr, addr+size)) as addressable. + * + * This memory must be previously allocated by your program. Accessing + * addresses in this region is allowed until this region is poisoned again. + * This function could unpoison a super-region of [addr, addr+size) due + * to ASan alignment restrictions. + * + * \note This function is not thread-safe because no two threads can + * poison or unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#endif + +#endif /* ZSTD_COMPILER_H */ diff --git a/GraphBLAS/zstd/zstd_subset/common/cpu.h b/GraphBLAS/zstd/zstd_subset/common/cpu.h new file mode 100644 index 000000000..8acd33be3 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/cpu.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved.
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMMON_CPU_H +#define ZSTD_COMMON_CPU_H + +/** + * Implementation taken from folly/CpuId.h + * https://github.com/facebook/folly/blob/master/folly/CpuId.h + */ + +#include "mem.h" + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +typedef struct { + U32 f1c; + U32 f1d; + U32 f7b; + U32 f7c; +} ZSTD_cpuid_t; + +MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { + U32 f1c = 0; + U32 f1d = 0; + U32 f7b = 0; + U32 f7c = 0; +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + int reg[4]; + __cpuid((int*)reg, 0); + { + int const n = reg[0]; + if (n >= 1) { + __cpuid((int*)reg, 1); + f1c = (U32)reg[2]; + f1d = (U32)reg[3]; + } + if (n >= 7) { + __cpuidex((int*)reg, 7, 0); + f7b = (U32)reg[1]; + f7c = (U32)reg[2]; + } + } +#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) + /* The following block is like the normal cpuid branch below, but gcc + * reserves ebx for use of its pic register so we must specially + * handle the save and restore to avoid clobbering the register + */ + U32 n; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(n) + : "a"(0) + : "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(f1a), "=c"(f1c), "=d"(f1d) + : "a"(1)); + } + if (n >= 7) { + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "movl %%ebx, %%eax\n\t" + "popl %%ebx" + : "=a"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) + U32 n; + __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); + } + if (n >= 7) { + U32 f7a; + __asm__("cpuid" + : "=a"(f7a), "=b"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#endif + { + ZSTD_cpuid_t cpuid; + cpuid.f1c = f1c; + cpuid.f1d = f1d; + cpuid.f7b = f7b; + cpuid.f7c = f7c; + return cpuid; + } +} + +#define X(name, r, bit) \ + MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \ + return ((cpuid.r) & (1U << bit)) != 0; \ + } + +/* cpuid(1): Processor Info and Feature Bits. */ +#define C(name, bit) X(name, f1c, bit) + C(sse3, 0) + C(pclmuldq, 1) + C(dtes64, 2) + C(monitor, 3) + C(dscpl, 4) + C(vmx, 5) + C(smx, 6) + C(eist, 7) + C(tm2, 8) + C(ssse3, 9) + C(cnxtid, 10) + C(fma, 12) + C(cx16, 13) + C(xtpr, 14) + C(pdcm, 15) + C(pcid, 17) + C(dca, 18) + C(sse41, 19) + C(sse42, 20) + C(x2apic, 21) + C(movbe, 22) + C(popcnt, 23) + C(tscdeadline, 24) + C(aes, 25) + C(xsave, 26) + C(osxsave, 27) + C(avx, 28) + C(f16c, 29) + C(rdrand, 30) +#undef C +#define D(name, bit) X(name, f1d, bit) + D(fpu, 0) + D(vme, 1) + D(de, 2) + D(pse, 3) + D(tsc, 4) + D(msr, 5) + D(pae, 6) + D(mce, 7) + D(cx8, 8) + D(apic, 9) + D(sep, 11) + D(mtrr, 12) + D(pge, 13) + D(mca, 14) + D(cmov, 15) + D(pat, 16) + D(pse36, 17) + D(psn, 18) + D(clfsh, 19) + D(ds, 21) + D(acpi, 22) + D(mmx, 23) + D(fxsr, 24) + D(sse, 25) + D(sse2, 26) + D(ss, 27) + D(htt, 28) + D(tm, 29) + D(pbe, 31) +#undef D + +/* cpuid(7): Extended Features.
*/ +#define B(name, bit) X(name, f7b, bit) + B(bmi1, 3) + B(hle, 4) + B(avx2, 5) + B(smep, 7) + B(bmi2, 8) + B(erms, 9) + B(invpcid, 10) + B(rtm, 11) + B(mpx, 14) + B(avx512f, 16) + B(avx512dq, 17) + B(rdseed, 18) + B(adx, 19) + B(smap, 20) + B(avx512ifma, 21) + B(pcommit, 22) + B(clflushopt, 23) + B(clwb, 24) + B(avx512pf, 26) + B(avx512er, 27) + B(avx512cd, 28) + B(sha, 29) + B(avx512bw, 30) + B(avx512vl, 31) +#undef B +#define C(name, bit) X(name, f7c, bit) + C(prefetchwt1, 0) + C(avx512vbmi, 1) +#undef C + +#undef X + +#endif /* ZSTD_COMMON_CPU_H */ diff --git a/GraphBLAS/zstd/zstd_subset/common/debug.c b/GraphBLAS/zstd/zstd_subset/common/debug.c new file mode 100644 index 000000000..bb863c9ea --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/debug.c @@ -0,0 +1,24 @@ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * This module only hosts one global variable + * which can be used to dynamically influence the verbosity of traces, + * such as DEBUGLOG and RAWLOG + */ + +#include "debug.h" + +int g_debuglevel = DEBUGLEVEL; diff --git a/GraphBLAS/zstd/zstd_subset/common/debug.h b/GraphBLAS/zstd/zstd_subset/common/debug.h new file mode 100644 index 000000000..3b2a320a1 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/debug.h @@ -0,0 +1,107 @@ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * The purpose of this header is to enable debug functions. + * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time, + * and DEBUG_STATIC_ASSERT() for compile-time. + * + * By default, DEBUGLEVEL==0, which means run-time debug is disabled. + * + * Level 1 enables assert() only. + * Starting level 2, traces can be generated and pushed to stderr. + * The higher the level, the more verbose the traces. + * + * It's possible to dynamically adjust level using variable g_debuglevel, + * which is only declared if DEBUGLEVEL>=2, + * and is a global variable, not multi-thread protected (use with care) + */ + +#ifndef DEBUG_H_12987983217 +#define DEBUG_H_12987983217 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ?
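/* negative array size => compile-time error when c is false */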
1 : -1]) + + +/* DEBUGLEVEL is expected to be defined externally, + * typically through compiler command line. + * Value must be a number. */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + + +/* recommended values for DEBUGLEVEL : + * 0 : release mode, no debug, all run-time checks disabled + * 1 : enables assert() only, no display + * 2 : reserved, for currently active debug path + * 3 : events once per object lifetime (CCtx, CDict, etc.) + * 4 : events once per frame + * 5 : events once per block + * 6 : events once per sequence (verbose) + * 7+: events at every position (*very* verbose) + * + * It's generally inconvenient to output traces > 5. + * In which case, it's possible to selectively trigger high verbosity levels + * by modifying g_debug_level. + */ + +#if (DEBUGLEVEL>=1) +# define ZSTD_DEPS_NEED_ASSERT +# include "zstd_deps.h" +#else +# ifndef assert /* assert may be already defined, due to prior #include */ +# define assert(condition) ((void)0) /* disable assert (default) */ +# endif +#endif + +#if (DEBUGLEVEL>=2) +# define ZSTD_DEPS_NEED_IO +# include "zstd_deps.h" +extern int g_debuglevel; /* the variable is only declared, + it actually lives in debug.c, + and is shared by the whole process. + It's not thread-safe. + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +# define RAWLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__VA_ARGS__); \ + } } +# define DEBUGLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ + ZSTD_DEBUG_PRINT(" \n"); \ + } } +#else +# define RAWLOG(l, ...) {} /* disabled */ +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + + +#if defined (__cplusplus) +} +#endif + +#endif /* DEBUG_H_12987983217 */ diff --git a/GraphBLAS/zstd/zstd_subset/common/entropy_common.c b/GraphBLAS/zstd/zstd_subset/common/entropy_common.c new file mode 100644 index 000000000..98bd4238d --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/entropy_common.c @@ -0,0 +1,341 @@ +/* ****************************************************************** + * Common functions of New Generation Entropy library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
+#include "huf.h"
+#include "bits.h"                /* ZSTD_highbit32, ZSTD_countTrailingZeros32 */
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+FORCE_INLINE_TEMPLATE
+size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                           const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    unsigned const maxSV1 = *maxSVPtr + 1;
+    int previous0 = 0;
+
+    if (hbSize < 8) {
+        /* This function only works when hbSize >= 8 */
+        char buffer[8] = {0};
+        ZSTD_memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);
+            return countSize;
+    }   }
+    assert(hbSize >= 8);
+
+    /* init */
+    ZSTD_memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    for (;;) {
+        if (previous0) {
+            /* Count the number of repeats. Each time the
+             * 2-bit repeat code is 0b11 there is another
+             * repeat.
+             * Avoid UB by setting the high bit to 1.
+             */
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            while (repeats >= 12) {
+                charnum += 3 * 12;
+                if (LIKELY(ip <= iend-7)) {
+                    ip += 3;
+                } else {
+                    bitCount -= (int)(8 * (iend - 7 - ip));
+                    bitCount &= 31;
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> bitCount;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            }
+            charnum += 3 * repeats;
+            bitStream >>= 2 * repeats;
+            bitCount += 2 * repeats;
+
+            /* Add the final repeat which isn't 0b11. */
+            assert((bitStream & 3) < 3);
+            charnum += bitStream & 3;
+            bitCount += 2;
+
+            /* This is an error, but break and return an error
+             * at the end, because returning out of a loop makes
+             * it harder for the compiler to optimize.
+             */
+            if (charnum >= maxSV1) break;
+
+            /* We don't need to set the normalized count to 0
+             * because we already memset the whole buffer to 0.
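+             *
+             * Worked example of the repeat decoding above (illustrative) :
+             * if the next bits of bitStream, read LSB first, are 11 11 10,
+             * then ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1 == 2,
+             * so charnum advances by 3*2 for the two 0b11 codes, and the final
+             * 0b10 code adds 2 more : 8 consecutive zero counts in total.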
+ */ + + if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + assert((bitCount >> 3) <= 3); /* For first condition to work */ + ip += bitCount>>3; + bitCount &= 7; + } else { + bitCount -= (int)(8 * (iend - 4 - ip)); + bitCount &= 31; + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; + } + { + int const max = (2*threshold-1) - remaining; + int count; + + if ((bitStream & (threshold-1)) < (U32)max) { + count = bitStream & (threshold-1); + bitCount += nbBits-1; + } else { + count = bitStream & (2*threshold-1); + if (count >= threshold) count -= max; + bitCount += nbBits; + } + + count--; /* extra accuracy */ + /* When it matters (small blocks), this is a + * predictable branch, because we don't use -1. + */ + if (count >= 0) { + remaining -= count; + } else { + assert(count == -1); + remaining += count; + } + normalizedCounter[charnum++] = (short)count; + previous0 = !count; + + assert(threshold > 1); + if (remaining < threshold) { + /* This branch can be folded into the + * threshold update condition because we + * know that threshold > 1. + */ + if (remaining <= 1) break; + nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; + + if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + ip += bitCount>>3; + bitCount &= 7; + } else { + bitCount -= (int)(8 * (iend - 4 - ip)); + bitCount &= 31; + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; + } } + if (remaining != 1) return ERROR(corruption_detected); + /* Only possible when there are too many zeros. */ + if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall); + if (bitCount > 32) return ERROR(corruption_detected); + *maxSVPtr = charnum-1; + + ip += (bitCount+7)>>3; + return ip-istart; +} + +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t FSE_readNCount_body_default( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} + +#if DYNAMIC_BMI2 +BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} +#endif + +size_t FSE_readNCount_bmi2( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); + } +#endif + (void)bmi2; + return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} + +size_t FSE_readNCount( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0); +} + + +/*! HUF_readStats() : + Read compact Huffman tree, saved by HUF_writeCTable(). + `huffWeight` is destination buffer. + `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32. + @return : size read from `src` , or an error Code . + Note : Needed by HUF_readCTable() and HUF_readDTableX?() . 
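+    A call sketch for illustration (buffer sizes follow the notes above;
+    HUF_SYMBOLVALUE_MAX and HUF_TABLELOG_MAX come from huf.h) :
+        BYTE weights[HUF_SYMBOLVALUE_MAX + 1];
+        U32  rankStats[HUF_TABLELOG_MAX + 1];
+        U32  nbSymbols, tableLog;
+        size_t const hSize = HUF_readStats(weights, sizeof(weights), rankStats,
+                                           &nbSymbols, &tableLog, src, srcSize);
+        if (HUF_isError(hSize)) return hSize;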
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                   U32* nbSymbolsPtr, U32* tableLogPtr,
+                   const void* src, size_t srcSize,
+                   void* workSpace, size_t wkspSize,
+                   int bmi2)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* ZSTD_memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        /* max (hwSize-1) values decoded, as last one is implied */
+        oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2);
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);  /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+
+/* Avoids the FORCE_INLINE of the _body() function.
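+ * This is the usual zstd dynamic-dispatch pattern : the FORCE_INLINE body is
+ * instantiated twice, once plainly and once under BMI2_TARGET_ATTRIBUTE, and
+ * the public wrapper picks one at runtime from the caller-supplied bmi2 flag.
+ * A minimal call-site sketch (assuming cpu.h from this same patch) :
+ *     int const bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ *     HUF_readStats_wksp(huffWeight, hwSize, rankStats, &nbSymbols, &tableLog,
+ *                        src, srcSize, wksp, sizeof(wksp), bmi2);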
*/ +static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0); +} + +#if DYNAMIC_BMI2 +static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1); +} +#endif + +size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, + int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +#endif + (void)bmi2; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); +} diff --git a/GraphBLAS/zstd/zstd_subset/common/error_private.c b/GraphBLAS/zstd/zstd_subset/common/error_private.c new file mode 100644 index 000000000..1b67500f3 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/error_private.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/* The purpose of this file is to have a single list of error strings embedded in binary */ + +#include "error_private.h" + +const char* ERR_getErrorString(ERR_enum code) +{ +#ifdef ZSTD_STRIP_ERROR_STRINGS + (void)code; + return "Error strings stripped"; +#else + static const char* const notErrorCode = "Unspecified error code"; + switch( code ) + { + case PREFIX(no_error): return "No error detected"; + case PREFIX(GENERIC): return "Error (generic)"; + case PREFIX(prefix_unknown): return "Unknown frame descriptor"; + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; + case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough"; + case PREFIX(stage_wrong): return "Operation not authorized at current processing stage"; + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; + case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; + case PREFIX(maxCode): + default: return notErrorCode; + } +#endif +} diff --git a/GraphBLAS/zstd/zstd_subset/common/error_private.h b/GraphBLAS/zstd/zstd_subset/common/error_private.h new file mode 100644 index 000000000..007d81066 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/error_private.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/* Note : this module is expected to remain private, do not expose it */ + +#ifndef ERROR_H_MODULE +#define ERROR_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************************** +* Dependencies +******************************************/ +#include "../zstd_errors.h" /* enum list */ +#include "compiler.h" +#include "debug.h" +#include "zstd_deps.h" /* size_t */ + + +/* **************************************** +* Compiler-specific +******************************************/ +#if defined(__GNUC__) +# define ERR_STATIC static __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define ERR_STATIC static inline +#elif defined(_MSC_VER) +# define ERR_STATIC static __inline +#else +# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + + +/*-**************************************** +* Customization (error_public.h) +******************************************/ +typedef ZSTD_ErrorCode ERR_enum; +#define PREFIX(name) ZSTD_error_##name + + +/*-**************************************** +* Error codes handling +******************************************/ +#undef ERROR /* already defined on Visual Studio */ +#define ERROR(name) ZSTD_ERROR(name) +#define ZSTD_ERROR(name) ((size_t)-PREFIX(name)) + +ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + +ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + +/* check and forward error code */ +#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } + + +/*-**************************************** +* Error Strings +******************************************/ + +const char* ERR_getErrorString(ERR_enum code); /* error_private.c */ + +ERR_STATIC const char* ERR_getErrorName(size_t code) +{ + return ERR_getErrorString(ERR_getErrorCode(code)); +} + +/** + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilation. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. + */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; +} + +/** + * Ignore: this is an internal helper. + * + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } + +#define ERR_QUOTE(str) #str + +/** + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } + +/** + * Unconditionally return the specified error. 
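+ * An illustrative use (the printf-style payload only prints when the debug
+ * level is at least 3) :
+ *     RETURN_ERROR(corruption_detected, "invalid header");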
+ * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0); + +/** + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) \ + do { \ + size_t const err_code = (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0); + +#if defined (__cplusplus) +} +#endif + +#endif /* ERROR_H_MODULE */ diff --git a/GraphBLAS/zstd/zstd_subset/common/fse.h b/GraphBLAS/zstd/zstd_subset/common/fse.h new file mode 100644 index 000000000..466a07281 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/fse.h @@ -0,0 +1,717 @@ +/* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef FSE_H +#define FSE_H + + +/*-***************************************** +* Dependencies +******************************************/ +#include "zstd_deps.h" /* size_t, ptrdiff_t */ + + +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define FSE_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define FSE_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define FSE_PUBLIC_API +#endif + +/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + + +/*-**************************************** +* FSE simple functions +******************************************/ +/*! 
FSE_compress() :
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize).
+    @return : size of compressed data (<= dstCapacity).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSE_isError() .
+
+    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+    Both parameters can be defined as '0' to mean : use default value
+    @return : size of compressed data
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[] (see hist.h)
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+ It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. + @return : recommended tableLog (necessarily <= 'maxTableLog') */ +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_normalizeCount(): + normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) + 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + useLowProbCount is a boolean parameter which trades off compressed size for + faster header decoding. When it is set to 1, the compressed data will be slightly + smaller. And when it is set to 0, FSE_readNCount() and FSE_buildDTable() will be + faster. If you are compressing a small amount of data (< 2 KB) then useLowProbCount=0 + is a good default, since header deserialization makes a big speed difference. + Otherwise, useLowProbCount=1 is a good default, since the speed difference is small. + @return : tableLog, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount); + +/*! FSE_NCountWriteBound(): + Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. + Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, + unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + +/*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). + @return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. 
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_headerBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). + +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + + +/* *** DECOMPRESSION *** */ + +/*! FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize); + +/*! FSE_readNCount_bmi2(): + * Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise. + */ +FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +/*! Constructor and Destructor of FSE_DTable. + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); + +/*! FSE_buildDTable(): + Builds 'dt', which must be already allocated, using FSE_createDTable(). 
+ return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_decompress_usingDTable(): + Decompress compressed source `cSrc` of size `cSrcSize` using `dt` + into `dst` which must be already allocated. + @return : size of regenerated data (necessarily <= `dstCapacity`), + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. +In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). +The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) +*/ + +#endif /* FSE_H */ + +#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) +#define FSE_H_FSE_STATIC_LINKING_ONLY + +/* *** Dependency *** */ +#include "bitstream.h" + + +/* ***************************************** +* Static allocation +*******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<((maxTableLog)-1)) + (((maxSymbolValue)+1)*2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<<(maxTableLog))) + +/* or use the size to malloc() space directly. 
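+   For illustration, a static allocation sketch using the macros above :
+       FSE_CTable ct[FSE_CTABLE_SIZE_U32(12, 255)];
+       FSE_DTable dt[FSE_DTABLE_SIZE_U32(12)];
+   is enough for any tableLog <= 12 and maxSymbolValue <= 255.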
Pay attention to alignment restrictions though */ +#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue) (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable)) +#define FSE_DTABLE_SIZE(maxTableLog) (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable)) + + +/* ***************************************** + * FSE advanced API + ***************************************** */ + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); +/**< same as FSE_optimalTableLog(), which used `minus==2` */ + +/* FSE_compress_wksp() : + * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). + * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. + */ +#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + +size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ + +size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); +/**< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. + */ +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) +size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + +#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1ULL << maxTableLog) + 8) +#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned)) +FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ + +size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +/**< build a fake FSE_DTable, designed to always generate the same symbolValue */ + +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) +#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * 
sizeof(unsigned)) +size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ + +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +/**< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ + +typedef enum { + FSE_repeat_none, /**< Cannot use the previous table */ + FSE_repeat_check, /**< Can use the previous table but it must be checked */ + FSE_repeat_valid /**< Can use the previous table and it is assumed to be valid */ + } FSE_repeat; + +/* ***************************************** +* FSE symbol compression API +*******************************************/ +/*! + This API consists of small unitary functions, which highly benefit from being inlined. + Hence their body are included in next section. +*/ +typedef struct { + ptrdiff_t value; + const void* stateTable; + const void* symbolTT; + unsigned stateLog; +} FSE_CState_t; + +static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct); + +static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol); + +static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr); + +/**< +These functions are inner components of FSE_compress_usingCTable(). +They allow the creation of custom streams, mixing multiple tables and bit sources. + +A key property to keep in mind is that encoding and decoding are done **in reverse direction**. +So the first symbol you will encode is the last you will decode, like a LIFO stack. + +You will need a few variables to track your CStream. They are : + +FSE_CTable ct; // Provided by FSE_buildCTable() +BIT_CStream_t bitStream; // bitStream tracking structure +FSE_CState_t state; // State tracking structure (can have several) + + +The first thing to do is to init bitStream and state. + size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize); + FSE_initCState(&state, ct); + +Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError(); +You can then encode your input data, byte after byte. +FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time. +Remember decoding will be done in reverse direction. + FSE_encodeByte(&bitStream, &state, symbol); + +At any time, you can also add any bit sequence. +Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders + BIT_addBits(&bitStream, bitField, nbBits); + +The above methods don't commit data to memory, they just store it into local register, for speed. +Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +Writing data to memory is a manual operation, performed by the flushBits function. + BIT_flushBits(&bitStream); + +Your last FSE encoding operation shall be to flush your last state value(s). + FSE_flushState(&bitStream, &state); + +Finally, you must close the bitStream. +The function returns the size of CStream in bytes. +If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible) +If there is an error, it returns an errorCode (which can be tested using FSE_isError()). 
+ size_t size = BIT_closeCStream(&bitStream); +*/ + + +/* ***************************************** +* FSE symbol decompression API +*******************************************/ +typedef struct { + size_t state; + const void* table; /* precise table may vary, depending on U16 */ +} FSE_DState_t; + + +static void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt); + +static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD); + +static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr); + +/**< +Let's now decompose FSE_decompress_usingDTable() into its unitary components. +You will decode FSE-encoded symbols from the bitStream, +and also any other bitFields you put in, **in reverse order**. + +You will need a few variables to track your bitStream. They are : + +BIT_DStream_t DStream; // Stream context +FSE_DState_t DState; // State context. Multiple ones are possible +FSE_DTable* DTablePtr; // Decoding table, provided by FSE_buildDTable() + +The first thing to do is to init the bitStream. + errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize); + +You should then retrieve your initial state(s) +(in reverse flushing order if you have several ones) : + errorCode = FSE_initDState(&DState, &DStream, DTablePtr); + +You can then decode your data, symbol after symbol. +For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'. +Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out). + unsigned char symbol = FSE_decodeSymbol(&DState, &DStream); + +You can retrieve any bitfield you eventually stored into the bitStream (in reverse order) +Note : maximum allowed nbBits is 25, for 32-bits compatibility + size_t bitField = BIT_readBits(&DStream, nbBits); + +All above operations only read from local register (which size depends on size_t). +Refueling the register from memory is manually performed by the reload method. + endSignal = FSE_reloadDStream(&DStream); + +BIT_reloadDStream() result tells if there is still some more data to read from DStream. +BIT_DStream_unfinished : there is still some data left into the DStream. +BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled. +BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed. +BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted. + +When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop, +to properly detect the exact end of stream. +After each decoded symbol, check if DStream is fully consumed using this simple test : + BIT_reloadDStream(&DStream) >= BIT_DStream_completed + +When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. +Checking if DStream has reached its end is performed by : + BIT_endOfDStream(&DStream); +Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible. 
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional values get rounded up (i.e.
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16; +} + +/* FSE_bitCost() : + * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16; + U32 const threshold = (minNbBits+1) << 16; + assert(tableLog < 16); + assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */ + { U32 const tableSize = 1 << tableLog; + U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize); + U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */ + U32 const bitMultiplier = 1 << accuracyLog; + assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold); + assert(normalizedDeltaFromThreshold <= bitMultiplier); + return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold; + } +} + + +/* ====== Decompression ====== */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + +MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) +{ + const void* ptr = dt; + const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + return DInfo.symbol; +} + +MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.newState + lowBits; +} + +MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +/*! 
FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+#if (FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE)
+#  error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE"
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif diff --git a/GraphBLAS/zstd/zstd_subset/common/fse_decompress.c b/GraphBLAS/zstd/zstd_subset/common/fse_decompress.c new file mode 100644 index 000000000..7034fd97b --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/fse_decompress.c @@ -0,0 +1,405 @@
+/* ******************************************************************
+ * FSE : Finite State Entropy decoder
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "debug.h"      /* assert */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"
+#include "bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+    ZSTD_free(dt);
+}
+
+static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16* symbolNext = (U16*)workSpace;
+    BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
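+         * For example (illustrative) : a symbol with a normalized count of 5
+         * still writes a full 8-byte chunk of its value, then advances pos by
+         * only 5 ; the next symbol's first write overlaps the 3 surplus bytes,
+         * and the 8 spare bytes reserved at the end of the spread buffer
+         * absorb the final over-write.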
+         */
+        {
+            U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                pos += n;
+            }
+        }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].symbol = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+typedef struct {
+    short ncount[FSE_MAX_SYMBOL_VALUE + 1];
+    FSE_DTable dtable[1]; /* Dynamically sized */
+} FSE_DecompressWksp;
+
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+        void* dst, size_t dstCapacity,
+        const void* cSrc, size_t cSrcSize,
+        unsigned maxLog, void* workSpace, size_t wkspSize,
+        int bmi2)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
+
+    DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
+    if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
+
+    /* normal FSE decoding mode */
+    {
+        size_t const
NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); + ip += NCountLength; + cSrcSize -= NCountLength; + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); + assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); + workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { + const void* ptr = wksp->dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ + if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); + return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); + } +} + +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +{ + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0); +} + +#if DYNAMIC_BMI2 +BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +{ + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1); +} +#endif + +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } +#endif + (void)bmi2; + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); +} + + +typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; + +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { + U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_TABLELOG_ABSOLUTE_MAX, FSE_MAX_SYMBOL_VALUE)]; + return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp)); +} + +size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) +{ + /* Static analyzer seems unable to understand this table will be properly initialized later */ + U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; + return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, FSE_MAX_TABLELOG, wksp, sizeof(wksp)); +} +#endif + + +#endif /* FSE_COMMONDEFS_ONLY */ diff --git a/GraphBLAS/zstd/zstd_subset/common/huf.h b/GraphBLAS/zstd/zstd_subset/common/huf.h new file mode 100644 index 000000000..85518481e --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/huf.h @@ -0,0 +1,364 @@ +/* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library + * Copyright (c) Yann Collet, Facebook, Inc. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef HUF_H_298734234 +#define HUF_H_298734234 + +/* *** Dependencies *** */ +#include "zstd_deps.h" /* size_t */ + + +/* *** library symbols visibility *** */ +/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, + * HUF symbols remain "private" (internal symbols for library only). + * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define HUF_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +#else +# define HUF_PUBLIC_API +#endif + + +/* ========================== */ +/* *** simple functions *** */ +/* ========================== */ + +/** HUF_compress() : + * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. + * 'dst' buffer must be already allocated. + * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). + * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. + * @return : size of compressed data (<= `dstCapacity`). + * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) + */ +HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/** HUF_decompress() : + * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', + * into already allocated buffer 'dst', of minimum size 'dstSize'. + * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. + * Note : in contrast with FSE, HUF_decompress can regenerate + * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, + * because it knows size to regenerate (originalSize). + * @return : size of regenerated data (== originalSize), + * or an error code, which can be tested using HUF_isError() + */ +HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize); + + +/* *** Tool functions *** */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ +HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ + +/* Error Management */ +HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ +HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ + + +/* *** Advanced function *** */ + +/** HUF_compress2() : + * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. 
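+ *  (Editor's illustration: HUF_compress2(dst, dstCapacity, src, srcSize, 255, 11)
+ *  is expected to behave like plain HUF_compress(), 255 and 11 matching the
+ *  library's HUF_SYMBOLVALUE_MAX and HUF_TABLELOG_DEFAULT; this equivalence is
+ *  the editor's reading, not upstream documentation.)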
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . + * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog); + +/** HUF_compress4X_wksp() : + * Same as HUF_compress2(), but uses externally allocated `workSpace`. + * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) +#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize); + +#endif /* HUF_H_298734234 */ + +/* ****************************************************************** + * WARNING !! + * The following section contains advanced and experimental definitions + * which shall never be used in the context of a dynamic library, + * because they are not guaranteed to remain stable in the future. + * Only consider them in association with static linking. + * *****************************************************************/ +#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +#define HUF_H_HUF_STATIC_LINKING_ONLY + +/* *** Dependencies *** */ +#include "mem.h" /* U32 */ +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" + + +/* *** Constants *** */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ +#define HUF_SYMBOLVALUE_MAX 255 + +#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) +# error "HUF_TABLELOG_MAX is too large !" +#endif + + +/* **************************************** +* Static allocation +******************************************/ +/* HUF buffer bounds */ +#define HUF_CTABLEBOUND 129 +#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* static allocation of HUF's Compression Table */ +/* this is a private definition, just exposed for allocation and strict aliasing purpose. 
never EVER access its members directly */ +typedef size_t HUF_CElt; /* consider it an incomplete type */ +#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t)) +#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ + +/* static allocation of HUF's DTable */ +typedef U32 HUF_DTable; +#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog))) +#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) } +#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) } + + +/* **************************************** +* Advanced decompression functions +******************************************/ +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +#endif + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +#endif + + +/* **************************************** + * HUF detailed API + * ****************************************/ + +/*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") + * 2. (optional) refine tableLog using HUF_optimalTableLog() + * 3. build Huffman table from count using HUF_buildCTable() + * 4. save Huffman table to memory buffer using HUF_writeCTable() + * 5. encode the data stream using HUF_compress4X_usingCTable() + * + * The following API allows targeting specific sub-functions for advanced tasks. + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. 
In which case, CTable will overwrite count content */ +size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +typedef enum { + HUF_repeat_none, /**< Cannot use the previous table */ + HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; +/** HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) +size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize); + +/*! HUF_readStats() : + * Read compact Huffman tree, saved by HUF_writeCTable(). + * `huffWeight` is destination buffer. + * @return : size read from `src` , or an error Code . + * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ +size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize); + +/*! HUF_readStats_wksp() : + * Same as HUF_readStats() but takes an external workspace which must be + * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
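+ *  A call sketch (editor's illustration; the buffer sizes shown are
+ *  assumptions, not upstream documentation):
+ *    BYTE weights[HUF_SYMBOLVALUE_MAX + 1];
+ *    U32  rankStats[HUF_TABLELOG_MAX + 1];
+ *    U32  nbSymbols, tableLog;
+ *    U32  wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
+ *    size_t const hSize = HUF_readStats_wksp(weights, sizeof(weights),
+ *                             rankStats, &nbSymbols, &tableLog,
+ *                             src, srcSize, wksp, sizeof(wksp), 0);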
+ */
+#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
+#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
+                          U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                          const void* src, size_t srcSize,
+                          void* workspace, size_t wkspSize,
+                          int bmi2);
+
+/** HUF_readCTable() :
+ *  Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
+
+/** HUF_getNbBitsFromCTable() :
+ *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ *  Note 1 : is not inlined, as HUF_CElt definition is private */
+U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLELOG_MAX=12 to ~1850 bytes when HUF_TABLELOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
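+ *  (Editor's arithmetic: the macro below evaluates to
+ *  (2 << 10) + (1 << 9) = 2048 + 512 = 2560 bytes, i.e. 640 U32 words.)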
+ */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif + + +/* ====================== */ +/* single stream variants */ +/* ====================== */ + +size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); +/** HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. 
+ * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+#endif
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
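+ * A detection sketch (editor's illustration; ZSTD_cpuSupportsBmi2() is
+ * assumed to be the helper from zstd's internal cpu.h and is not declared
+ * in this header):
+ *   int const bmi2 = ZSTD_cpuSupportsBmi2();
+ *   HUF_decompress1X_usingDTable_bmi2(dst, dstSize, cSrc, cSrcSize, DTable, bmi2);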
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/GraphBLAS/zstd/zstd_subset/common/mem.h b/GraphBLAS/zstd/zstd_subset/common/mem.h
new file mode 100644
index 000000000..4b10f7c1f
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/mem.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include "compiler.h"   /* __has_builtin */
+#include "debug.h"      /* DEBUG_STATIC_ASSERT */
+#include "zstd_deps.h"  /* ZSTD_memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  if defined(_AIX)
+#    include <inttypes.h>
+#  else
+#    include <stdint.h> /* intptr_t */
+#  endif
+  typedef  uint8_t BYTE;
+  typedef  uint8_t U8;
+  typedef   int8_t S8;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+#  include <limits.h>
+#if CHAR_BIT != 8
+#  error "this implementation requires char to be exactly 8-bit type"
+#endif
+  typedef unsigned char BYTE;
+  typedef unsigned char U8;
+  typedef   signed char S8;
+#if USHRT_MAX != 65535
+#  error "this implementation requires short
to be exactly 16-bit type"
+#endif
+  typedef unsigned short U16;
+  typedef   signed short S16;
+#if UINT_MAX != 4294967295
+#  error "this implementation requires int to be exactly 32-bit type"
+#endif
+  typedef unsigned int U32;
+  typedef   signed int S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
+  typedef unsigned long long U64;
+  typedef   signed long long S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O API
+*****************************************************************/
+/*=== Static platform detection ===*/
+MEM_STATIC unsigned MEM_32bits(void);
+MEM_STATIC unsigned MEM_64bits(void);
+MEM_STATIC unsigned MEM_isLittleEndian(void);
+
+/*=== Native unaligned read/write ===*/
+MEM_STATIC U16 MEM_read16(const void* memPtr);
+MEM_STATIC U32 MEM_read32(const void* memPtr);
+MEM_STATIC U64 MEM_read64(const void* memPtr);
+MEM_STATIC size_t MEM_readST(const void* memPtr);
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value);
+MEM_STATIC void MEM_write32(void* memPtr, U32 value);
+MEM_STATIC void MEM_write64(void* memPtr, U64 value);
+
+/*=== Little endian unaligned read/write ===*/
+MEM_STATIC U16 MEM_readLE16(const void* memPtr);
+MEM_STATIC U32 MEM_readLE24(const void* memPtr);
+MEM_STATIC U32 MEM_readLE32(const void* memPtr);
+MEM_STATIC U64 MEM_readLE64(const void* memPtr);
+MEM_STATIC size_t MEM_readLEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val);
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val);
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val);
+
+/*=== Big endian unaligned read/write ===*/
+MEM_STATIC U32 MEM_readBE32(const void* memPtr);
+MEM_STATIC U64 MEM_readBE64(const void* memPtr);
+MEM_STATIC size_t MEM_readBEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val);
+
+/*=== Byteswap ===*/
+MEM_STATIC U32 MEM_swap32(U32 in);
+MEM_STATIC U64 MEM_swap64(U64 in);
+MEM_STATIC size_t MEM_swapST(size_t in);
+
+
+/*-**************************************************************
+*  Memory I/O Implementation
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
+ *            See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
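+ *            (Editor's example: building with -DMEM_FORCE_MEMORY_ACCESS=2 on the
+ *            compiler command line selects direct access; leaving the macro
+ *            undefined lets the block below pick a default, method 1 on
+ *            gcc/icc and method 0 elsewhere.)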
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# define MEM_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } +MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } + +MEM_STATIC unsigned MEM_isLittleEndian(void) +{ +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return 1; +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + return 0; +#elif defined(__clang__) && __LITTLE_ENDIAN__ + return 1; +#elif defined(__clang__) && __BIG_ENDIAN__ + return 0; +#elif defined(_MSC_VER) && (_M_AMD64 || _M_IX86) + return 1; +#elif defined(__DMC__) && defined(_M_IX86) + return 1; +#else + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +#endif +} + +#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) + +/* violates C standard, by lying on structure alignment. +Only use if no other choice to achieve best performance on target platform */ +MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } +MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } +MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } +MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } + +#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) + __pragma( pack(push, 1) ) + typedef struct { U16 v; } unalign16; + typedef struct { U32 v; } unalign32; + typedef struct { U64 v; } unalign64; + typedef struct { size_t v; } unalignArch; + __pragma( pack(pop) ) +#else + typedef struct { U16 v; } __attribute__((packed)) unalign16; + typedef struct { U32 v; } __attribute__((packed)) unalign32; + typedef struct { U64 v; } __attribute__((packed)) unalign64; + typedef struct { size_t v; } __attribute__((packed)) unalignArch; +#endif + +MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } + +#else + +/* default method, safe and standard. 
+ can sometimes prove slower */ + +MEM_STATIC U16 MEM_read16(const void* memPtr) +{ + U16 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U32 MEM_read32(const void* memPtr) +{ + U32 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U64 MEM_read64(const void* memPtr) +{ + U64 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC size_t MEM_readST(const void* memPtr) +{ + size_t val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) +{ + ZSTD_memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write32(void* memPtr, U32 value) +{ + ZSTD_memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write64(void* memPtr, U64 value) +{ + ZSTD_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* MEM_FORCE_MEMORY_ACCESS */ + +MEM_STATIC U32 MEM_swap32_fallback(U32 in) +{ + return ((in << 24) & 0xff000000 ) | + ((in << 8) & 0x00ff0000 ) | + ((in >> 8) & 0x0000ff00 ) | + ((in >> 24) & 0x000000ff ); +} + +MEM_STATIC U32 MEM_swap32(U32 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_ulong(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) + return __builtin_bswap32(in); +#else + return MEM_swap32_fallback(in); +#endif +} + +MEM_STATIC U64 MEM_swap64_fallback(U64 in) +{ + return ((in << 56) & 0xff00000000000000ULL) | + ((in << 40) & 0x00ff000000000000ULL) | + ((in << 24) & 0x0000ff0000000000ULL) | + ((in << 8) & 0x000000ff00000000ULL) | + ((in >> 8) & 0x00000000ff000000ULL) | + ((in >> 24) & 0x0000000000ff0000ULL) | + ((in >> 40) & 0x000000000000ff00ULL) | + ((in >> 56) & 0x00000000000000ffULL); +} + +MEM_STATIC U64 MEM_swap64(U64 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_uint64(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) + return __builtin_bswap64(in); +#else + return MEM_swap64_fallback(in); +#endif +} + +MEM_STATIC size_t MEM_swapST(size_t in) +{ + if (MEM_32bits()) + return (size_t)MEM_swap32((U32)in); + else + return (size_t)MEM_swap64((U64)in); +} + +/*=== Little endian r/w ===*/ + +MEM_STATIC U16 MEM_readLE16(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read16(memPtr); + else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) +{ + if (MEM_isLittleEndian()) { + MEM_write16(memPtr, val); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + +MEM_STATIC U32 MEM_readLE24(const void* memPtr) +{ + return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16); +} + +MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) +{ + MEM_writeLE16(memPtr, (U16)val); + ((BYTE*)memPtr)[2] = (BYTE)(val>>16); +} + +MEM_STATIC U32 MEM_readLE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read32(memPtr); + else + return MEM_swap32(MEM_read32(memPtr)); +} + +MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, val32); + else + MEM_write32(memPtr, MEM_swap32(val32)); +} + +MEM_STATIC U64 MEM_readLE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read64(memPtr); + else + return MEM_swap64(MEM_read64(memPtr)); +} + +MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + 
MEM_write64(memPtr, val64); + else + MEM_write64(memPtr, MEM_swap64(val64)); +} + +MEM_STATIC size_t MEM_readLEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readLE32(memPtr); + else + return (size_t)MEM_readLE64(memPtr); +} + +MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeLE32(memPtr, (U32)val); + else + MEM_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +MEM_STATIC U32 MEM_readBE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap32(MEM_read32(memPtr)); + else + return MEM_read32(memPtr); +} + +MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, MEM_swap32(val32)); + else + MEM_write32(memPtr, val32); +} + +MEM_STATIC U64 MEM_readBE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap64(MEM_read64(memPtr)); + else + return MEM_read64(memPtr); +} + +MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, MEM_swap64(val64)); + else + MEM_write64(memPtr, val64); +} + +MEM_STATIC size_t MEM_readBEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readBE32(memPtr); + else + return (size_t)MEM_readBE64(memPtr); +} + +MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeBE32(memPtr, (U32)val); + else + MEM_writeBE64(memPtr, (U64)val); +} + +/* code only tested on 32 and 64 bits systems */ +MEM_STATIC void MEM_check(void) { DEBUG_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } + + +#if defined (__cplusplus) +} +#endif + +#endif /* MEM_H_MODULE */ diff --git a/GraphBLAS/zstd/zstd_subset/common/pool.c b/GraphBLAS/zstd/zstd_subset/common/pool.c new file mode 100644 index 000000000..5c1d07d35 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/pool.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + +/* ====== Dependencies ======= */ +#include "zstd_deps.h" /* size_t */ +#include "debug.h" /* assert */ +#include "zstd_internal.h" /* ZSTD_customMalloc, ZSTD_customFree */ +#include "pool.h" + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +#ifdef ZSTD_MULTITHREAD + +#include "threading.h" /* pthread adaptation */ + +/* A job is a function and an opaque argument */ +typedef struct POOL_job_s { + POOL_function function; + void *opaque; +} POOL_job; + +struct POOL_ctx_s { + ZSTD_customMem customMem; + /* Keep track of the threads */ + ZSTD_pthread_t* threads; + size_t threadCapacity; + size_t threadLimit; + + /* The queue is a circular buffer */ + POOL_job *queue; + size_t queueHead; + size_t queueTail; + size_t queueSize; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates if the queue is empty */ + int queueEmpty; + + /* The mutex protects the queue */ + ZSTD_pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + ZSTD_pthread_cond_t queuePushCond; + /* Condition variables for poppers to wait on when the queue is empty */ + ZSTD_pthread_cond_t queuePopCond; + /* Indicates if the queue is shutting down */ + int shutdown; +}; + +/* POOL_thread() : + * Work thread for the thread pool. + * Waits for jobs and executes them. + * @returns : NULL on failure else non-null. + */ +static void* POOL_thread(void* opaque) { + POOL_ctx* const ctx = (POOL_ctx*)opaque; + if (!ctx) { return NULL; } + for (;;) { + /* Lock the mutex and wait for a non-empty queue or until shutdown */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + + while ( ctx->queueEmpty + || (ctx->numThreadsBusy >= ctx->threadLimit) ) { + if (ctx->shutdown) { + /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit), + * a few threads will be shutdown while !queueEmpty, + * but enough threads will remain active to finish the queue */ + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); + } + /* Pop a job off the queue */ + { POOL_job const job = ctx->queue[ctx->queueHead]; + ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; + ctx->numThreadsBusy++; + ctx->queueEmpty = (ctx->queueHead == ctx->queueTail); + /* Unlock the mutex, signal a pusher, and run the job */ + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + + job.function(job.opaque); + + /* If the intended queue size was 0, signal after finishing job */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->numThreadsBusy--; + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + } + } /* for (;;) */ + assert(0); /* Unreachable */ +} + +/* ZSTD_createThreadPool() : public access point */ +POOL_ctx* ZSTD_createThreadPool(size_t numThreads) { + return POOL_create (numThreads, 0); +} + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem) +{ + POOL_ctx* ctx; + /* Check parameters */ + if (!numThreads) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = (POOL_ctx*)ZSTD_customCalloc(sizeof(POOL_ctx), customMem); + if (!ctx) { return NULL; } + /* Initialize the job queue. 
+ * It needs one extra space since one space is wasted to differentiate + * empty and full queues. + */ + ctx->queueSize = queueSize + 1; + ctx->queue = (POOL_job*)ZSTD_customMalloc(ctx->queueSize * sizeof(POOL_job), customMem); + ctx->queueHead = 0; + ctx->queueTail = 0; + ctx->numThreadsBusy = 0; + ctx->queueEmpty = 1; + { + int error = 0; + error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); + if (error) { POOL_free(ctx); return NULL; } + } + ctx->shutdown = 0; + /* Allocate space for the thread handles */ + ctx->threads = (ZSTD_pthread_t*)ZSTD_customMalloc(numThreads * sizeof(ZSTD_pthread_t), customMem); + ctx->threadCapacity = 0; + ctx->customMem = customMem; + /* Check for errors */ + if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = i; + POOL_free(ctx); + return NULL; + } } + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + } + return ctx; +} + +/*! POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. +*/ +static void POOL_join(POOL_ctx* ctx) { + /* Shut down the queue */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + /* Wake up sleeping threads */ + ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + /* Join all of the threads */ + { size_t i; + for (i = 0; i < ctx->threadCapacity; ++i) { + ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */ + } } +} + +void POOL_free(POOL_ctx *ctx) { + if (!ctx) { return; } + POOL_join(ctx); + ZSTD_pthread_mutex_destroy(&ctx->queueMutex); + ZSTD_pthread_cond_destroy(&ctx->queuePushCond); + ZSTD_pthread_cond_destroy(&ctx->queuePopCond); + ZSTD_customFree(ctx->queue, ctx->customMem); + ZSTD_customFree(ctx->threads, ctx->customMem); + ZSTD_customFree(ctx, ctx->customMem); +} + +/*! POOL_joinJobs() : + * Waits for all queued jobs to finish executing. 
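+ *  A lifecycle sketch (editor's illustration; myJob and arg are placeholder
+ *  names, not part of this API):
+ *      POOL_ctx* const pool = POOL_create(4, 16);
+ *      POOL_add(pool, myJob, &arg);
+ *      POOL_joinJobs(pool);    returns once myJob has completed
+ *      POOL_free(pool);
+ *  Unlike POOL_free(), POOL_joinJobs() leaves the pool usable, so further
+ *  jobs may still be added afterwards.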
+ */ +void POOL_joinJobs(POOL_ctx* ctx) { + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + while(!ctx->queueEmpty || ctx->numThreadsBusy > 0) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); +} + +void ZSTD_freeThreadPool (ZSTD_threadPool* pool) { + POOL_free (pool); +} + +size_t POOL_sizeof(const POOL_ctx* ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + + ctx->queueSize * sizeof(POOL_job) + + ctx->threadCapacity * sizeof(ZSTD_pthread_t); +} + + +/* @return : 0 on success, 1 on error */ +static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) +{ + if (numThreads <= ctx->threadCapacity) { + if (!numThreads) return 1; + ctx->threadLimit = numThreads; + return 0; + } + /* numThreads > threadCapacity */ + { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customMalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); + if (!threadPool) return 1; + /* replace existing thread pool */ + ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); + ZSTD_customFree(ctx->threads, ctx->customMem); + ctx->threads = threadPool; + /* Initialize additional threads */ + { size_t threadId; + for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) { + if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = threadId; + return 1; + } } + } } + /* successfully expanded */ + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + return 0; +} + +/* @return : 0 on success, 1 on error */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads) +{ + int result; + if (ctx==NULL) return 1; + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + result = POOL_resize_internal(ctx, numThreads); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return result; +} + +/** + * Returns 1 if the queue is full and 0 otherwise. + * + * When queueSize is 1 (pool was created with an intended queueSize of 0), + * then a queue is empty if there is a thread free _and_ no job is waiting. 
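+ * (Editor's illustration: a pool created with queueSize==0 gets an internal
+ * queueSize of 1; with threadLimit==2 it reports "full" only once both
+ * workers are busy or a job is already parked in the single queue slot.)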
+ */ +static int isQueueFull(POOL_ctx const* ctx) { + if (ctx->queueSize > 1) { + return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize); + } else { + return (ctx->numThreadsBusy == ctx->threadLimit) || + !ctx->queueEmpty; + } +} + + +static void +POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) +{ + POOL_job const job = {function, opaque}; + assert(ctx != NULL); + if (ctx->shutdown) return; + + ctx->queueEmpty = 0; + ctx->queue[ctx->queueTail] = job; + ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize; + ZSTD_pthread_cond_signal(&ctx->queuePopCond); +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + /* Wait until there is space in the queue for the new job */ + while (isQueueFull(ctx) && (!ctx->shutdown)) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); +} + + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + if (isQueueFull(ctx)) { + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 0; + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 1; +} + + +#else /* ZSTD_MULTITHREAD not defined */ + +/* ========================== */ +/* No multi-threading support */ +/* ========================== */ + + +/* We don't need any data, but if it is empty, malloc() might return NULL. */ +struct POOL_ctx_s { + int dummy; +}; +static POOL_ctx g_poolCtx; + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* +POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) +{ + (void)numThreads; + (void)queueSize; + (void)customMem; + return &g_poolCtx; +} + +void POOL_free(POOL_ctx* ctx) { + assert(!ctx || ctx == &g_poolCtx); + (void)ctx; +} + +void POOL_joinJobs(POOL_ctx* ctx){ + assert(!ctx || ctx == &g_poolCtx); + (void)ctx; +} + +int POOL_resize(POOL_ctx* ctx, size_t numThreads) { + (void)ctx; (void)numThreads; + return 0; +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); +} + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); + return 1; +} + +size_t POOL_sizeof(const POOL_ctx* ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + assert(ctx == &g_poolCtx); + return sizeof(*ctx); +} + +#endif /* ZSTD_MULTITHREAD */ diff --git a/GraphBLAS/zstd/zstd_subset/common/pool.h b/GraphBLAS/zstd/zstd_subset/common/pool.h new file mode 100644 index 000000000..b86a3452e --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/pool.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef POOL_H +#define POOL_H + +#if defined (__cplusplus) +extern "C" { +#endif + + +#include "zstd_deps.h" +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */ +#include "../zstd.h" + +typedef struct POOL_ctx_s POOL_ctx; + +/*! 
POOL_create() : + * Create a thread pool with at most `numThreads` threads. + * `numThreads` must be at least 1. + * The maximum number of queued jobs before blocking is `queueSize`. + * @return : POOL_ctx pointer on success, else NULL. +*/ +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize); + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem); + +/*! POOL_free() : + * Free a thread pool returned by POOL_create(). + */ +void POOL_free(POOL_ctx* ctx); + + +/*! POOL_joinJobs() : + * Waits for all queued jobs to finish executing. + */ +void POOL_joinJobs(POOL_ctx* ctx); + +/*! POOL_resize() : + * Expands or shrinks pool's number of threads. + * This is more efficient than releasing + creating a new context, + * since it tries to preserve and re-use existing threads. + * `numThreads` must be at least 1. + * @return : 0 when resize was successful, + * !0 (typically 1) if there is an error. + * note : only numThreads can be resized, queueSize remains unchanged. + */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads); + +/*! POOL_sizeof() : + * @return threadpool memory usage + * note : compatible with NULL (returns 0 in this case) + */ +size_t POOL_sizeof(const POOL_ctx* ctx); + +/*! POOL_function : + * The function type that can be added to a thread pool. + */ +typedef void (*POOL_function)(void*); + +/*! POOL_add() : + * Add the job `function(opaque)` to the thread pool. `ctx` must be valid. + * Possibly blocks until there is room in the queue. + * Note : The function may be executed asynchronously, + * therefore, `opaque` must live until function has been completed. + */ +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque); + + +/*! POOL_tryAdd() : + * Add the job `function(opaque)` to thread pool _if_ a queue slot is available. + * Returns immediately even if not (does not block). + * @return : 1 if successful, 0 if not. + */ +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque); + + +#if defined (__cplusplus) +} +#endif + +#endif diff --git a/GraphBLAS/zstd/zstd_subset/common/portability_macros.h b/GraphBLAS/zstd/zstd_subset/common/portability_macros.h new file mode 100644 index 000000000..1650fa3d8 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/portability_macros.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_PORTABILITY_MACROS_H +#define ZSTD_PORTABILITY_MACROS_H + +/** + * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * + * This header ONLY defines macros to detect platforms/feature support. + * + */ + + +/* compat. with non-clang compilers */ +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/* compat. 
with non-clang compilers */
+#ifndef __has_feature
+#  define __has_feature(x) 0
+#endif
+
+/* detects whether we are being compiled under msan */
+#ifndef ZSTD_MEMORY_SANITIZER
+#  if __has_feature(memory_sanitizer)
+#    define ZSTD_MEMORY_SANITIZER 1
+#  else
+#    define ZSTD_MEMORY_SANITIZER 0
+#  endif
+#endif
+
+/* detects whether we are being compiled under asan */
+#ifndef ZSTD_ADDRESS_SANITIZER
+#  if __has_feature(address_sanitizer)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  elif defined(__SANITIZE_ADDRESS__)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  else
+#    define ZSTD_ADDRESS_SANITIZER 0
+#  endif
+#endif
+
+/* detects whether we are being compiled under dfsan */
+#ifndef ZSTD_DATAFLOW_SANITIZER
+#  if __has_feature(dataflow_sanitizer)
+#    define ZSTD_DATAFLOW_SANITIZER 1
+#  else
+#    define ZSTD_DATAFLOW_SANITIZER 0
+#  endif
+#endif
+
+/* Mark the internal assembly functions as hidden */
+#ifdef __ELF__
+#  define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
+#else
+#  define ZSTD_HIDE_ASM_FUNCTION(func)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+      && (defined(__x86_64__) || defined(_M_X64)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/**
+ * Only enable assembly for GNUC compatible compilers,
+ * because other platforms may not support GAS assembly syntax.
+ *
+ * Only enable assembly for Linux / MacOS, other platforms may
+ * work, but they haven't been tested. This could likely be
+ * extended to BSD systems.
+ *
+ * Disable assembly when MSAN is enabled, because MSAN requires
+ * 100% of code to be instrumented to work.
+ */
+#if defined(__GNUC__)
+#  if defined(__linux__) || defined(__linux) || defined(__APPLE__)
+#    if ZSTD_MEMORY_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    elif ZSTD_DATAFLOW_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    else
+#      define ZSTD_ASM_SUPPORTED 1
+#    endif
+#  else
+#    define ZSTD_ASM_SUPPORTED 0
+#  endif
+#else
+#  define ZSTD_ASM_SUPPORTED 0
+#endif
+
+/**
+ * Determines whether we should enable assembly for x86-64
+ * with BMI2.
+ *
+ * Enable if all of the following conditions hold:
+ * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM
+ * - Assembly is supported
+ * - We are compiling for x86-64 and either:
+ *   - DYNAMIC_BMI2 is enabled
+ *   - BMI2 is supported at compile time
+ */
+#if !defined(ZSTD_DISABLE_ASM) && \
+    ZSTD_ASM_SUPPORTED && \
+    defined(__x86_64__) && \
+    (DYNAMIC_BMI2 || defined(__BMI2__))
+#  define ZSTD_ENABLE_ASM_X86_64_BMI2 1
+#else
+#  define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+#endif
+
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+#  if __has_include(<cet.h>)
+#    include <cet.h>
+#  endif
+#endif
+
+#endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/GraphBLAS/zstd/zstd_subset/common/threading.c b/GraphBLAS/zstd/zstd_subset/common/threading.c
new file mode 100644
index 000000000..4f69f7662
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/threading.c
@@ -0,0 +1,124 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
diff --git a/GraphBLAS/zstd/zstd_subset/common/threading.c b/GraphBLAS/zstd/zstd_subset/common/threading.c
new file mode 100644
index 000000000..4f69f7662
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/threading.c
@@ -0,0 +1,124 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**
+ * This file holds wrappers for systems which do not support pthreads
+ */
+
+#include "threading.h"
+
+/* create fake symbol to avoid empty translation unit warning */
+int g_ZSTD_threading_useless_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+
+
+/* ===  Implementation  === */
+
+static unsigned __stdcall worker(void *arg)
+{
+    ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg;
+    thread->arg = thread->start_routine(thread->arg);
+    return 0;
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{
+    (void)unused;
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL);
+
+    if (!thread->handle)
+        return errno;
+    else
+        return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr)
+{
+    DWORD result;
+
+    if (!thread.handle) return 0;
+
+    result = WaitForSingleObject(thread.handle, INFINITE);
+    CloseHandle(thread.handle);
+
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (value_ptr) *value_ptr = thread.arg;
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32)
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"
+
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr)
+{
+    *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t));
+    if (!*mutex)
+        return 1;
+    return pthread_mutex_init(*mutex, attr);
+}
+
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex)
+{
+    if (!*mutex)
+        return 0;
+    {
+        int const ret = pthread_mutex_destroy(*mutex);
+        ZSTD_free(*mutex);
+        return ret;
+    }
+}
+
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr)
+{
+    *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t));
+    if (!*cond)
+        return 1;
+    return pthread_cond_init(*cond, attr);
+}
+
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond)
+{
+    if (!*cond)
+        return 0;
+    {
+        int const ret = pthread_cond_destroy(*cond);
+        ZSTD_free(*cond);
+        return ret;
+    }
+}
+
+#endif
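A small sketch of how the wrappers defined in this file (and declared in threading.h below) are meant to be used; illustrative only, assuming ZSTD_MULTITHREAD is defined so they map to real threads and mutexes:

    #include "threading.h"

    static ZSTD_pthread_mutex_t g_lock;   /* CRITICAL_SECTION or pthread_mutex_t */
    static int g_counter = 0;

    static void* increment(void* arg)
    {
        (void)arg;
        ZSTD_pthread_mutex_lock(&g_lock);
        g_counter++;
        ZSTD_pthread_mutex_unlock(&g_lock);
        return NULL;
    }

    int threading_demo(void)
    {
        ZSTD_pthread_t t1, t2;
        ZSTD_pthread_mutex_init(&g_lock, NULL);
        ZSTD_pthread_create(&t1, NULL, increment, NULL);
        ZSTD_pthread_create(&t2, NULL, increment, NULL);
        ZSTD_pthread_join(t1, NULL);   /* also releases the thread handle */
        ZSTD_pthread_join(t2, NULL);
        ZSTD_pthread_mutex_destroy(&g_lock);
        return g_counter;   /* 2 */
    }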
diff --git a/GraphBLAS/zstd/zstd_subset/common/threading.h b/GraphBLAS/zstd/zstd_subset/common/threading.h
new file mode 100644
index 000000000..fd0060d5a
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/threading.h
@@ -0,0 +1,155 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#include "debug.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+# undef WINVER
+#endif
+#define WINVER 0x0600
+
+#ifdef _WIN32_WINNT
+# undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+    HANDLE handle;
+    void* (*start_routine)(void*);
+    void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#if DEBUGLEVEL < 1
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b)         pthread_join((a),(b))
+
+#else /* DEBUGLEVEL >= 1 */
+
+/* Debug implementation of threading.
+ * In this implementation we use pointers for mutexes and condition variables.
+ * This way, if we forget to init/destroy them the program will crash or ASAN
+ * will report leaks.
+ */ + +#define ZSTD_pthread_mutex_t pthread_mutex_t* +int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr); +int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex); +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a)) + +#define ZSTD_pthread_cond_t pthread_cond_t* +int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr); +int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond); +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + +#endif + +#else /* ZSTD_MULTITHREAD not defined */ +/* No multithreading support */ + +typedef int ZSTD_pthread_mutex_t; +#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_mutex_destroy(a) ((void)(a)) +#define ZSTD_pthread_mutex_lock(a) ((void)(a)) +#define ZSTD_pthread_mutex_unlock(a) ((void)(a)) + +typedef int ZSTD_pthread_cond_t; +#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b)) +#define ZSTD_pthread_cond_signal(a) ((void)(a)) +#define ZSTD_pthread_cond_broadcast(a) ((void)(a)) + +/* do not use ZSTD_pthread_t */ + +#endif /* ZSTD_MULTITHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ diff --git a/GraphBLAS/zstd/zstd_subset/common/xxhash.c b/GraphBLAS/zstd/zstd_subset/common/xxhash.c new file mode 100644 index 000000000..d49497cf1 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/xxhash.c @@ -0,0 +1,24 @@ +/* + * xxHash - Fast Hash algorithm + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash homepage: http://www.xxhash.com + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/GraphBLAS/zstd/zstd_subset/common/xxhash.h b/GraphBLAS/zstd/zstd_subset/common/xxhash.h new file mode 100644 index 000000000..f87102a1d --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/common/xxhash.h @@ -0,0 +1,5686 @@ +/* + * xxHash - Fast Hash algorithm + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash homepage: http://www.xxhash.com + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+*/ + + +#ifndef XXH_NO_XXH3 +# define XXH_NO_XXH3 +#endif + +#ifndef XXH_NAMESPACE +# define XXH_NAMESPACE ZSTD_ +#endif + +/*! + * @mainpage xxHash + * + * @file xxhash.h + * xxHash prototypes and implementation + */ +/* TODO: update */ +/* Notice extracted from xxHash homepage: + +xxHash is an extremely fast hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MurmurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +Note: SMHasher's CRC32 implementation is not the fastest one. +Other speed-oriented implementations can be faster, +especially in combination with PCLMUL instruction: +https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). 
+ * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. + */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + + +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. 
+ * @{ + */ +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest 
XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  1
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
+ *
+ * @return `XXH_VERSION_NUMBER` of the invoked library.
+ */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXH32_hash_t;
+#     else
+#       error "unsupported platform: need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that @ref xxh3_family provides competitive speed
+ *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+ */
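The XXH_NAME2 mappings above rename symbols at the link level only; call sites keep the plain XXH_ names. In this vendored copy XXH_NAMESPACE is set to ZSTD_ near the top of the header, so for instance XXH32 is really exported as ZSTD_XXH32. A sketch of the same mechanism with a hypothetical host-library prefix:

    /* mylib_hash.c -- hypothetical host library embedding xxHash */
    #define XXH_NAMESPACE MYLIB_    /* exported symbols become MYLIB_XXH32, ... */
    #include "xxhash.h"

    unsigned mylib_hash_version(void)
    {
        /* Source still reads XXH_versionNumber(); the macro renames the symbol,
         * so no collision occurs if another library also embeds xxHash. */
        return XXH_versionNumber();
    }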
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #define BUFFER_SIZE 256
+ *
+ *   // Note: XXH64 and XXH3 use the same interface.
+ *   XXH32_hash_t
+ *   hashFile(FILE* stream)
+ *   {
+ *       XXH32_state_t* state;
+ *       unsigned char buf[BUFFER_SIZE];
+ *       size_t amt;
+ *       XXH32_hash_t hash;
+ *
+ *       state = XXH32_createState();       // Create a state
+ *       assert(state != NULL);             // Error check here
+ *       XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
+ *       while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+ *           XXH32_update(state, buf, amt); // Hash the file in chunks
+ *       }
+ *       hash = XXH32_digest(state);        // Finalize the hash
+ *       XXH32_freeState(state);            // Clean up
+ *       return hash;
+ *   }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
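For contrast with the streaming example above, the one-shot form needs no state object; a minimal sketch (hypothetical helper name):

    #include <string.h>
    #include "xxhash.h"

    static XXH32_hash_t hash_once(const char* msg)
    {
        /* Same result as createState/reset/update/digest over the whole buffer. */
        return XXH32(msg, strlen(msg), 0xbaad5eed /* seed */);
    }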
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
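A sketch of the round trip these two declarations enable when a hash is stored or transmitted (hypothetical helper names; the canonical form is big endian regardless of host):

    #include <string.h>
    #include "xxhash.h"

    static void store_hash(XXH32_hash_t h, unsigned char out[4])
    {
        XXH32_canonical_t c;
        XXH32_canonicalFromHash(&c, h);            /* native -> big-endian bytes */
        memcpy(out, c.digest, sizeof c.digest);    /* portable to store or send */
    }

    static XXH32_hash_t load_hash(const unsigned char in[4])
    {
        XXH32_canonical_t c;
        memcpy(c.digest, in, sizeof c.digest);
        return XXH32_hashFromCanonical(&c);        /* big-endian bytes -> native */
    }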
+
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/*
+Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+introduced in CPP17 and C23.
+CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+*/
+#if XXH_HAS_C_ATTRIBUTE(x)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_CPP_ATTRIBUTE(x)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
+#else
+# define XXH_FALLTHROUGH
+#endif
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
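Usage mirrors XXH32() above; a minimal one-shot sketch (hypothetical helper name):

    #include "xxhash.h"

    static XXH64_hash_t hash64_once(const void* buf, size_t len)
    {
        return XXH64(buf, len, 0 /* seed */);   /* preferred on 64-bit targets */
    }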
+
+/*******   Streaming   *******/
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup xxh3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+ * can run XXH3 at competitive speeds, even without vector support.
+ * Further details are explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+/*-**********************************************************************
+* XXH3 64-bit variant
+************************************************************************/
+
+/* XXH3_64bits():
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/*
+ * XXH3_64bits_withSeed():
+ * This variant generates a custom secret on the fly
+ * based on default secret altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + + +/******* Streaming *******/ +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + +/* + * XXH3_64bits_reset(): + * Initialize with default parameters. + * digest will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/* + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). 
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+* XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+
+/* Following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * XXH128_cmp():
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * return: >0 if *h128_1  > *h128_2
+ *         =0 if *h128_1 == *h128_2
+ *         <0 if *h128_1  < *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
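A combined sketch of the one-shot XXH3 calls and the 128-bit helpers declared above (hypothetical demo function; note that this vendored copy defines XXH_NO_XXH3 at the top of the header, so the XXH3 family is compiled out here and the sketch applies to a standard xxHash build):

    #include "xxhash.h"

    static int xxh3_demo(const void* buf, size_t len)
    {
        XXH64_hash_t  h64   = XXH3_64bits(buf, len);              /* fastest variant */
        XXH64_hash_t  h64s  = XXH3_64bits_withSeed(buf, len, 42);
        XXH128_hash_t h128  = XXH3_128bits(buf, len);
        XXH128_hash_t again = XXH3_128bits(buf, len);

        (void)h64; (void)h64s;
        /* XXH128_hash_t is a struct, so == does not compile; use the helper. */
        return XXH128_isEqual(h128, again);   /* 1: same input, same hash */
    }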
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* >= C11 */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L)   /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number of stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
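A sketch of the stack-allocation pattern just described (assumes XXH_STATIC_LINKING_ONLY so that XXH3_state_t's definition and the XXH3_INITSTATE() macro defined just below are visible; as noted above, this vendored copy disables XXH3, so this applies to a standard xxHash build):

    #define XXH_STATIC_LINKING_ONLY   /* expose XXH3_state_t's definition */
    #include "xxhash.h"

    static XXH64_hash_t hash_two_parts(const void* p1, size_t n1,
                                       const void* p2, size_t n2)
    {
        XXH3_state_t state;        /* stack allocation: correctly aligned */
        XXH3_INITSTATE(&state);    /* required before _reset_withSeed() */
        XXH3_64bits_reset_withSeed(&state, 2022);
        XXH3_64bits_update(&state, p1, n1);
        XXH3_64bits_update(&state, p2, n2);
        return XXH3_64bits_digest(&state);   /* no freeState() needed */
    }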
+ */ +#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } + + +/* XXH128() : + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/* + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed, + * as it becomes much more difficult for an external actor to guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @secretSize + * into an already allocated buffer @secretBuffer. + * @secretSize must be >= XXH3_SECRET_SIZE_MIN + * + * The generated secret can then be used with any `*_withSecret()` variant. + * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, + * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so + * XXH3_generateSecret() can be employed to ensure proper quality. + * + * customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes. + * The resulting `secret` will nonetheless provide all required qualities. + * + * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize); + + +/* + * XXH3_generateSecret_fromSeed(): + * + * Generate the same secret as the _withSeed() variants. + * + * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily). + * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes. + * + * The generated secret can be used in combination with + *`*_withSecret()` and `_withSecretandSeed()` variants. + * This generator is notably useful in combination with `_withSecretandSeed()`, + * as a way to emulate a faster `_withSeed()` variant. + */ +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed); + +/* + * *_withSecretandSeed() : + * These variants generate hash values using either + * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) + * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. 
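A sketch of the emulation described here: derive the default-sized secret from a seed once, then hand both to the _withSecretandSeed() variant (hypothetical helper names; again assumes a build where XXH3 and the experimental API are enabled):

    #define XXH_STATIC_LINKING_ONLY   /* XXH3_SECRET_DEFAULT_SIZE + experimental API */
    #include "xxhash.h"

    static unsigned char g_secret[XXH3_SECRET_DEFAULT_SIZE];

    void secret_setup(XXH64_hash_t seed)   /* call once at startup */
    {
        XXH3_generateSecret_fromSeed(g_secret, seed);
    }

    XXH64_hash_t hash_fast(const void* buf, size_t len, XXH64_hash_t seed)
    {
        /* Same result as XXH3_64bits_withSeed(buf, len, seed), but the secret
         * is not regenerated for every large input. */
        return XXH3_64bits_withSecretandSeed(buf, len, g_secret, sizeof g_secret, seed);
    }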
+ * + * When @secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, + * via its impact to the seed. + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(const void* data, size_t len, + const void* secret, size_t secretSize, + XXH64_hash_t seed); + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* data, size_t len, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); + + +#endif /* XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. 
+ * + * Useful if only using the @ref xxh32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64 & arm64, + * which are platforms known to offer good unaligned memory accesses performance. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. 
+ * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */ +# if !defined(__clang__) && \ +( \ + (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + ( \ + defined(__GNUC__) && ( \ + (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \ + ( \ + defined(__mips__) && \ + (__mips <= 5 || __mips_isa_rev < 6) && \ + (!defined(__mips16) || defined(__mips_mips16e2)) \ + ) \ + ) \ + ) \ +) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ + || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define XXH32_ENDJMP 0 +#endif + +/*! 
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for ZSTD_malloc(), ZSTD_free() */
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"  /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */
+static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }
+static void  XXH_free(void* p) { ZSTD_free(p); }
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); }
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS /* disable inlining hints */
+# if defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __attribute__((unused))
+# else
+#  define XXH_FORCE_INLINE static
+# endif
+# define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+# define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined(_MSC_VER) /* Visual Studio */
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
+# define XXH_FORCE_INLINE static inline
+# define XXH_NO_INLINE static
+#else
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+# ifdef DEBUGLEVEL /* backwards compat */
+#  define XXH_DEBUGLEVEL DEBUGLEVEL
+# else
+#  define XXH_DEBUGLEVEL 0
+# endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+# include <assert.h>  /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# define XXH_ASSERT(c) ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
+#  include <assert.h>
+#  define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
+#  define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+# else
+#  define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+# endif
+# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#else
+# define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+# define BYTE xxh_u8
+# define U8 xxh_u8
+# define U32 xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
+    return ((const xxh_unalign*)ptr)->u32;
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. 
+ */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup xxh32_impl XXH32 implementation + * @ingroup impl + * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) 
making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param h32 The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= XXH_PRIME32_2; + h32 ^= h32 >> 13; + h32 *= XXH_PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param h32 The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. 
+ */ +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + h32 += (*ptr++) * XXH_PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 4: XXH_PROCESS4; + return XXH32_avalanche(h32); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(h32); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(h32); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
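+ * One-shot usage sketch (an editorial addition, not upstream xxHash text);
+ * it hashes a byte buffer in a single call with seed 0:
+ *
+ *     const char data[] = "hello";
+ *     XXH32_hash_t h = XXH32(data, sizeof(data) - 1, 0);
+ *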
@ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +/*! + * @ingroup xxh32_family + */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/*! + * @ingroup xxh32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + /* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */ + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; + return ((const xxh_unalign64*)ptr)->u64; +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! 
+ * @}
+ * @defgroup xxh64_impl XXH64 implementation
+ * @ingroup impl
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+# define PRIME64_1 XXH_PRIME64_1
+# define PRIME64_2 XXH_PRIME64_2
+# define PRIME64_3 XXH_PRIME64_3
+# define PRIME64_4 XXH_PRIME64_4
+# define PRIME64_5 XXH_PRIME64_5
+#endif
+
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+static xxh_u64 XXH64_avalanche(xxh_u64 h64)
+{
+    h64 ^= h64 >> 33;
+    h64 *= XXH_PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= XXH_PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+static xxh_u64
+XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        h64 ^= (*ptr++) * XXH_PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return XXH64_avalanche(h64);
+}
+
+#ifdef XXH_OLD_NAMES
+# define PROCESS1_64 XXH_PROCESS1_64
+# define PROCESS4_64 XXH_PROCESS4_64
+# define PROCESS8_64 XXH_PROCESS8_64
+#else
+# undef XXH_PROCESS1_64
+# undef XXH_PROCESS4_64
+# undef XXH_PROCESS8_64
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=32) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 31;
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7)
+            + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is 8-bytes aligned, leverage the speed benefit */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[2] = seed + 0;
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    return XXH_OK;
+}
+
+/*!
@ingroup xxh64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + /* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */ + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup xxh3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
Tested with GCC 5.5 */
+# define XXH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
+# define XXH_RESTRICT restrict
+#else
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+# define XXH_RESTRICT /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
+ || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+ || defined(__clang__)
+# define XXH_likely(x) __builtin_expect(x, 1)
+# define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+# define XXH_likely(x) (x)
+# define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+# if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+  || defined(__aarch64__) || defined(_M_ARM) \
+  || defined(_M_ARM64) || defined(_M_ARM64EC)
+#  define inline __inline__ /* circumvent a clang bug */
+#  include <arm_neon.h>
+#  undef inline
+# elif defined(__AVX2__)
+#  include <immintrin.h>
+# elif defined(__SSE2__)
+#  include <emmintrin.h>
+# endif
+#endif
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. 
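+ *
+ * Illustrative note (an editorial addition, not upstream xxHash text): the
+ * vector path, and hence the alignment selected below, can be forced from the
+ * build line, e.g. a hypothetical AVX2 build such as
+ *
+ *     cc -O3 -mavx2 -DXXH_VECTOR=XXH_AVX2 -c myfile.c
+ *
+ * in which case XXH_ACC_ALIGN below resolves to 32.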
+ */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. 
+ * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/*! + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && (defined(__GNUC__) || defined(__clang__)) \ + && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and + * 2 lanes on scalar by default. + * + * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the + * emulated 64-bit arithmetic is too slow. + * + * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't + * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, + * you are only using 2/3 of the CPU bandwidth. + * + * This is even more noticeable on the more advanced cores like the A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the + * remaining lanes will use scalar instructions. This improves the bandwidth + * and also gives the integer pipelines something to do besides twiddling loop + * counters and pointers. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. 
|
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
+ * | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ * | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ * | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__OPTIMIZE_SIZE__)
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+# if defined(__s390x__)
+#  include <s390intrin.h>
+# else
+/* gcc's altivec.h can have the unwanted consequence to unconditionally
+ * #define bool, vector, and pixel keywords,
+ * with bad consequences for programs already using these keywords for other purposes.
+ * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
+ * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
+ * but it seems that, in some cases, it isn't.
+ * Force the build macro to be defined, so that keywords are not altered.
+ */
+#  if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+#   define __APPLE_ALTIVEC__
+#  endif
+#  include <altivec.h>
+# endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#   define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#   warning "-maltivec=be is not recommended. Please use native endianness."
+#   define XXH_VSX_BE 1
+#  else
+#   define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#   define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on version.
+ */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+#else
+# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
+#  include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#  define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# else
+#  define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+# error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH. */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+
+#ifdef XXH_OLD_NAMES
+# define kSecret XXH3_kSecret
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#  define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#  define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Wasm is a 32-bit platform, yet Clang (and Emscripten) define this type
+     * there despite not having native arithmetic for it. This results in a
+     * laggy compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *       + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *       + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+     *     ---------
+     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)Rn * (xxh_u64)Rm + *RdLo + *RdHi;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+/*! Seems to produce slightly better code on GCC for some reason.
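+ * (A note on why this mixing step is safe: for 1 <= shift < 64, the map
+ * v64 ^ (v64 >> shift) is a bijection on 64-bit values, so no entropy is
+ * lost.)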
+ */
+XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    return v64 ^ (v64 >> shift);
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    h64 = XXH_xorshift64(h64, 37);
+    h64 *= 0x165667919E3779F9ULL;
+    h64 = XXH_xorshift64(h64, 32);
+    return h64;
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx,
+ * preferable when input has not been previously mixed.
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* this mix is inspired by Pelle Evensen's rrmxmx */
+    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    h64 *= 0x9FB21C651E98DF25ULL;
+    h64 ^= (h64 >> 35) + len;
+    h64 *= 0x9FB21C651E98DF25ULL;
+    return XXH_xorshift64(h64, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which
+ * strongly favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single-shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance for the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
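+ *
+ * The shared pattern, roughly (a sketch; the exact packing, offsets, final
+ * mixer, and seed sign differ per length class -- see the functions below):
+ *
+ *     bitflip = (readLE(secret + a) ^ readLE(secret + b)) +/- seed;
+ *     keyed   = packed_input ^ bitflip;
+ *     return avalanche(keyed);   /* or rrmxmx() for 4..8 bytes */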
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8  const c1 = input[0];
+        xxh_u8  const c2 = input[len >> 1];
+        xxh_u8  const c3 = input[len - 1];
+        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
+                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len <<  8);
+        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+        return XXH64_avalanche(keyed);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input1 = XXH_readLE32(input);
+        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+        xxh_u64 const keyed = input64 ^ bitflip;
+        return XXH3_rrmxmx(keyed, len);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+        xxh_u64 const acc = len
+                          + XXH_swap64(input_lo) + input_hi
+                          + XXH3_mul128_fold64(input_lo, input_hi);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+    }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC, where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is incurred an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64-bit outputs.
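+ *
+ * (Where the 1 in 2^63 figure comes from: in XXH3_mix16B below, the 128-bit
+ * product is zero only when one of its two 64-bit operands cancels against
+ * its secret word -- two independent ~1 in 2^64 events per 16-byte block,
+ * hence ~1 in 2^63 for a random input block.)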
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable, like the XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+     * slower code.
+     *
+     * By forcing seed64 into a register, we disrupt the cost model and
+     * cause it to scalarize. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is of interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 const input_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(
+            input_lo ^ (XXH_readLE64(secret)   + seed64),
+            input_hi ^ (XXH_readLE64(secret+8) - seed64)
+        );
+    }
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int i;
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        acc = XXH3_avalanche(acc);
+        XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but the other cases are usually relatively minor
+         * and/or not worth fixing.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes */
+        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        return XXH3_avalanche(acc);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    XXH_memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as arguments,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define the __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based on FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input into the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
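+ *
+ * Per 64-bit lane, one stripe does the following (a scalar sketch; this is
+ * exactly what XXH3_scalarRound() further below implements):
+ *
+ *     data_key      = input[lane] ^ secret[lane];
+ *     acc[lane ^ 1] += input[lane];  /* preserve the input, in the swapped lane */
+ *     acc[lane]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);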
+ */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
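+ *
+ * Per 64-bit lane, the scramble amounts to (a scalar sketch; this is what
+ * XXH3_scalarScrambleRound() further below implements):
+ *
+ *     acc[lane] = (acc[lane] ^ (acc[lane] >> 47) ^ secret[lane]) * XXH_PRIME32_1;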
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
+        /* xacc[0] ^= secret; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+
+        /* xacc[0] *= XXH_PRIME32_1; */
+        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0);  /* check alignment */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {
+            /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not
+             * 'void const*'; reading through a plain const pointer would warn
+             * "discards 'const' qualifier". */
+            union {
+                const __m512i* cp;
+                void* p;
+            } remote_const_void;
+            remote_const_void.cp = src + i;
+            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2 /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
+         */
+        const __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap   = _mm256_shuffle_epi32 (data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum         = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified causes the compiler to:
+         *   - not extract the secret from SSE registers in the internal loop
+         *   - use fewer common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0);  /* check alignment */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* GCC -O2 needs the loop unrolled manually */
+        dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4),
seed); + dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
+         */
+        const __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32-bit mode does not support _mm_set_epi64x before 2015 */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dst16' as modified causes the compiler to:
+         *   - not extract the secret from SSE registers in the internal loop
+         *   - use fewer common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0);  /* check alignment */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* forward declarations for the scalar routines */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can yield up to a 15% speedup depending on
+ * the CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {
+        uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
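+         * (A side note on the NEON loop below: vmlal_u32 is a fused
+         * 32x32->64 multiply-accumulate, so the 64-bit product and the
+         * addition into the accumulator cost a single instruction per
+         * pair of lanes.)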
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + /* NEON for the first few lanes (these loops are normally interleaved) */ + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + /* Scalar for the remainder. This may be a zero iteration loop. */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); + + size_t i; + /* NEON for the first few lanes (these loops are normally interleaved) */ + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8 (xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64 (data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= XXH_PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. + * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } + /* Scalar for the remainder. This may be a zero iteration loop. 
*/ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + unsigned int* const xacc = (unsigned int*) acc; + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + /* xacc[i] = acc_vec; */ + vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +/* scalar variants - universal */ + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* xacc = (xxh_u64*) acc;
+    xxh_u8 const* xinput  = (xxh_u8 const*) input;
+    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    {
+        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
+        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
+        xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes a 64-byte block of data using the scalar path.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__clang__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     *       MOVK
+     *       MOVK
+     *       MOVK
+     *       MOVK
+     *   ADD
+     *   SUB      STR
+     *            STR
+     *
+     * By forcing loads from memory (as the asm line causes Clang to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *
+     *   I   L   S
+     *       LDR
+     *   ADD LDR
+     *   SUB      STR
+     *            STR
+     *
+     * See XXH3_NEON_LANES for details on the pipelines.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    /*
+     * Note: in debug mode, this overrides the asm optimization
+     * and Clang will emit MOVK chains again.
+ */ + XXH_ASSERT(kSecretPtr == XXH3_kSecret); + + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + + + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
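+ * Each stripe consumes XXH_STRIPE_LEN (64) input bytes, but the secret
+ * pointer only advances by XXH_SECRET_CONSUME_RATE (8) bytes per stripe,
+ * so consecutive stripes read overlapping windows of the secret.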
+ * Assumption: nbStripes will not overflow the secret size
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
+                const xxh_u8* XXH_RESTRICT input,
+                const xxh_u8* XXH_RESTRICT secret,
+                      size_t nbStripes,
+                      XXH3_f_accumulate_512 f_acc512)
+{
+    size_t n;
+    for (n = 0; n < nbStripes; n++) {
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
+        f_acc512(acc,
+                 in,
+                 secret + n*XXH_SECRET_CONSUME_RATE);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate_512 f_acc512,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
+
+        /* last stripe */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-A. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up keys just over 240 bytes.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate_512 f_acc512,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit the secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using
+ * the AVX instruction set.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, which is easier on
+ * the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of the
+ * vector loop.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generates a custom key based on an alteration of the default XXH3_kSecret
+ * with the seed, and then uses this key for long-mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate_512 f_acc512,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc512, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc512, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken when the `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that the function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* === Public entry point === */
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
+{
+    return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* === XXH3 streaming === */
+
+/*
+ * Allocates memory that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32-byte aligned loads in AVX2,
+ * or, on 32-bit, for the 16-byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually; functions like posix_memalign or _mm_malloc
+ * are avoided because, to maintain portability, we would have to write a
+ * fallback like this anyway, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the base pointer is already aligned, there will always be
+             * at least one byte before the returned pointer to store the offset
+             * to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers; XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
@ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +XXH_FORCE_INLINE void +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { + /* need a scrambling operation */ + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); + f_scramble(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + *nbStripesSoFarPtr += nbStripes; + } +} + +#ifndef XXH3_STREAM_USE_STACK +# ifndef __clang__ /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. 
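As a usage aside (editor's hedged sketch, not part of the diff): the reset/update/digest cycle built from the state functions above looks like this in application code.

```c
/* Editor's sketch: hashing a buffer in two chunks with the streaming state
 * API above; the result matches the one-shot XXH3_64bits(). */
#define XXH_INLINE_ALL
#include "xxhash.h"
#include <assert.h>

static XXH64_hash_t hash_in_two_parts(const char* data, size_t len)
{
    XXH3_state_t* st = XXH3_createState();   /* 64-byte-aligned allocation */
    XXH64_hash_t h;
    assert(st != NULL);
    XXH3_64bits_reset(st);
    XXH3_64bits_update(st, data, len / 2);
    XXH3_64bits_update(st, data + len / 2, len - len / 2);
    h = XXH3_64bits_digest(st);
    XXH3_freeState(st);
    return h;
}
```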
+     * Operating into stack space seems to enable proper optimization.
+     * clang, on the other hand, doesn't seem to need this trick */
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
+#else
+    xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+    state->totalLen += len;
+    XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+    /* small input : just fill in tmp buffer */
+    if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
+        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+        state->bufferedSize += (XXH32_hash_t)len;
+        return XXH_OK;
+    }
+
+    /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+    #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+    XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+    /*
+     * Internal buffer is partially filled (always, except at beginning)
+     * Complete it, then consume it.
+     */
+    if (state->bufferedSize) {
+        size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+        XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+        input += loadSize;
+        XXH3_consumeStripes(acc,
+                           &state->nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                            secret, state->secretLimit,
+                            f_acc512, f_scramble);
+        state->bufferedSize = 0;
+    }
+    XXH_ASSERT(input < bEnd);
+
+    /* large input to consume : ingest per full block */
+    if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+        size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+        XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
+        /* join to current block's end */
+        {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
+            XXH_ASSERT(nbStripesToEnd <= nbStripes);
+            XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
+            f_scramble(acc, secret + state->secretLimit);
+            state->nbStripesSoFar = 0;
+            input += nbStripesToEnd * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesToEnd;
+        }
+        /* consume per entire blocks */
+        while(nbStripes >= state->nbStripesPerBlock) {
+            XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
+            f_scramble(acc, secret + state->secretLimit);
+            input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
+            nbStripes -= state->nbStripesPerBlock;
+        }
+        /* consume last partial block */
+        XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
+        input += nbStripes * XXH_STRIPE_LEN;
+        XXH_ASSERT(input < bEnd);  /* at least some bytes left */
+        state->nbStripesSoFar = nbStripes;
+        /* buffer predecessor of last partial stripe */
+        XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+        XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
+    } else {
+        /* content to consume <= block size */
+        /* Consume input by a multiple of internal buffer size */
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+            do {
+                XXH3_consumeStripes(acc,
+                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                    input, XXH3_INTERNALBUFFER_STRIPES,
+                                    secret, state->secretLimit,
+                                    f_acc512, f_scramble);
+                input += XXH3_INTERNALBUFFER_SIZE;
+            } while (input < limit);
+            /* buffer predecessor of last partial stripe */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+        }
+    }
+
+    /* Some remaining input (always) : buffer it */
+    XXH_ASSERT(input < bEnd);
+    XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+    XXH_ASSERT(state->bufferedSize == 0);
+    XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+    state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+    /* save stack accumulators into state */
+    memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate_512, XXH3_scrambleAcc);
+        /* last stripe */
+        XXH3_accumulate_512(acc,
+                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        xxh_u8 lastStripe[XXH_STRIPE_LEN];
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        XXH3_accumulate_512(acc,
+                            lastStripe,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    }
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    if (state->useSeed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
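Because XXH3_digest_long() above works on a local copy of the accumulators, a digest is non-destructive; a hedged editor's sketch of what that permits:

```c
/* Editor's sketch: taking a checkpoint hash mid-stream and continuing.
 * XXH3_64bits_digest() leaves the state untouched, so both digests below
 * are valid XXH3 values over their respective prefixes. */
#define XXH_INLINE_ALL
#include "xxhash.h"

XXH64_hash_t checkpoint_then_continue(const char* buf, size_t n1, size_t n2)
{
    XXH3_state_t* st = XXH3_createState();
    XXH64_hash_t partial, full;
    XXH3_64bits_reset(st);
    XXH3_64bits_update(st, buf, n1);
    partial = XXH3_64bits_digest(st);      /* hash of the first n1 bytes */
    XXH3_64bits_update(st, buf + n1, n2);  /* state continues ingesting */
    full = XXH3_64bits_digest(st);         /* hash of all n1 + n2 bytes */
    (void)partial;
    XXH3_freeState(st);
    return full;
}
```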
+ */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. 
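The 1-3 byte kernel above packs its three sampled bytes plus the length into a single 32-bit word; a standalone sketch of just that packing (helper name is the editor's, not xxHash's):

```c
#include <stddef.h>
#include <stdint.h>

/* Editor's sketch of the `combinedl` packing in XXH3_len_1to3_128b above:
 * first byte in bits 16..23, middle byte in 24..31, last byte in 0..7,
 * and the length (1..3) in bits 8..15. */
static uint32_t pack_1to3(const uint8_t* in, size_t len)
{
    uint8_t const c1 = in[0];          /* requires 1 <= len <= 3 */
    uint8_t const c2 = in[len >> 1];
    uint8_t const c3 = in[len - 1];
    return ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24)
         | ((uint32_t)c3 <<  0) | ((uint32_t)len <<  8);
}
```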
+ */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
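The algebra above is easy to check mechanically; a hedged sketch verifying that both formulations agree modulo 2^64 (helper is the editor's):

```c
#include <assert.h>
#include <stdint.h>

/* Editor's check of the mask-removal identity used above:
 *   (hi & 0xFFFFFFFF00000000) + lo32*c  ==  hi + lo32*(c-1)   (mod 2^64)
 * where lo32 is the low 32 bits of hi. Expanding the right-hand side gives
 * lo32 + hi_masked + lo32*c - lo32, which is exactly the left-hand side. */
static void check_mask_removal(uint64_t input_hi, uint32_t c)
{
    uint64_t const lo32 = (uint32_t)input_hi;
    uint64_t const masked   = (input_hi & 0xFFFFFFFF00000000ULL) + lo32 * (uint64_t)c;
    uint64_t const unmasked = input_hi + lo32 * (uint64_t)(c - 1);
    assert(masked == unmasked);
}
```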
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + 
~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! 
@ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ + +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2
+ *          <0 if *h128_1 < *h128_2
+ *          =0 if *h128_1 == *h128_2 */
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*====== Canonical representation ======*/
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+    XXH_writeLE64( dst,          XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+{
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    { size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    { size_t const nbSeg16 = secretSize / 16;
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
+{
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);
+    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
diff --git a/GraphBLAS/zstd/zstd_subset/common/zstd_deps.h b/GraphBLAS/zstd/zstd_subset/common/zstd_deps.h
new file mode 100644
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/zstd_deps.h
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides common libc dependencies that zstd requires.
+ * The purpose is to allow replacing this file with a custom implementation
+ * to compile zstd without libc support.
+ */
+
+/* Need:
+ * NULL
+ * INT_MAX
+ * UINT_MAX
+ * ZSTD_memcpy()
+ * ZSTD_memset()
+ * ZSTD_memmove()
+ */
+#ifndef ZSTD_DEPS_COMMON
+#define ZSTD_DEPS_COMMON
+
+#include <limits.h>
+#include <stddef.h>
+#include <string.h>
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+# define ZSTD_memcpy(d,s,l) __builtin_memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) __builtin_memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) __builtin_memset((p),(v),(l))
+#else
+# define ZSTD_memcpy(d,s,l) memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) memset((p),(v),(l))
+#endif
+
+#endif /* ZSTD_DEPS_COMMON */
+
+/* Need:
+ * ZSTD_malloc()
+ * ZSTD_free()
+ * ZSTD_calloc()
+ */
+#ifdef ZSTD_DEPS_NEED_MALLOC
+#ifndef ZSTD_DEPS_MALLOC
+#define ZSTD_DEPS_MALLOC
+
+#include <stdlib.h>
+
+#define ZSTD_malloc(s) malloc(s)
+#define ZSTD_calloc(n,s) calloc((n), (s))
+#define ZSTD_free(p) free((p))
+
+#endif /* ZSTD_DEPS_MALLOC */
+#endif /* ZSTD_DEPS_NEED_MALLOC */
+
+/*
+ * Provides 64-bit math support.
+ * Need:
+ * U64 ZSTD_div64(U64 dividend, U32 divisor)
+ */
+#ifdef ZSTD_DEPS_NEED_MATH64
+#ifndef ZSTD_DEPS_MATH64
+#define ZSTD_DEPS_MATH64
+
+#define ZSTD_div64(dividend, divisor) ((dividend) / (divisor))
+
+#endif /* ZSTD_DEPS_MATH64 */
+#endif /* ZSTD_DEPS_NEED_MATH64 */
+
+/* Need:
+ * assert()
+ */
+#ifdef ZSTD_DEPS_NEED_ASSERT
+#ifndef ZSTD_DEPS_ASSERT
+#define ZSTD_DEPS_ASSERT
+
+#include <assert.h>
+
+#endif /* ZSTD_DEPS_ASSERT */
+#endif /* ZSTD_DEPS_NEED_ASSERT */
+
+/* Need:
+ * ZSTD_DEBUG_PRINT()
+ */
+#ifdef ZSTD_DEPS_NEED_IO
+#ifndef ZSTD_DEPS_IO
+#define ZSTD_DEPS_IO
+
+#include <stdio.h>
+#define ZSTD_DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+
+#endif /* ZSTD_DEPS_IO */
+#endif /* ZSTD_DEPS_NEED_IO */
+
+/* Only requested when <stdint.h> is known to be present.
+ * Need:
+ * intptr_t
+ */
+#ifdef ZSTD_DEPS_NEED_STDINT
+#ifndef ZSTD_DEPS_STDINT
+#define ZSTD_DEPS_STDINT
+
+#include <stdint.h>
+
+#endif /* ZSTD_DEPS_STDINT */
+#endif /* ZSTD_DEPS_NEED_STDINT */
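An aside on the dependency layer just defined: the ZSTD_DEPS_* hooks swap out libc calls at compile time, while per-context heap control is also available through zstd's public ZSTD_customMem interface. A hedged editor's sketch (requires ZSTD_STATIC_LINKING_ONLY; not part of this diff):

```c
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include <stdlib.h>

/* Editor's sketch: route a compression context's allocations through
 * user-supplied functions instead of the ZSTD_malloc defaults above. */
static void* my_alloc(void* opaque, size_t size) { (void)opaque; return malloc(size); }
static void  my_free (void* opaque, void* addr)  { (void)opaque; free(addr); }

static ZSTD_CCtx* make_cctx_with_custom_heap(void)
{
    ZSTD_customMem const mem = { my_alloc, my_free, NULL };
    return ZSTD_createCCtx_advanced(mem);   /* NULL on allocation failure */
}
```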
diff --git a/GraphBLAS/zstd/zstd_subset/common/zstd_internal.h b/GraphBLAS/zstd/zstd_subset/common/zstd_internal.h
new file mode 100644
index 000000000..e89226702
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/zstd_internal.h
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include "compiler.h"
+#include "cpu.h"
+#include "mem.h"
+#include "debug.h"          /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "../zstd.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY  /* XXH64_state_t */
+#endif
+#include "xxhash.h"         /* XXH_reset, update, digest */
+#ifndef ZSTD_NO_TRACE
+# include "zstd_trace.h"
+#else
+# define ZSTD_TRACE 0
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError
+#define HUF_isError  ERR_isError
+
+
+/*-*************************************
+* shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define BOUNDED(min,val,max) (MAX(min,MIN(val,max)))
+
+
+/*-*************************************
+* Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static UNUSED_ATTR const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static UNUSED_ATTR const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static UNUSED_ATTR const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define LitHufLog 11
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq  MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
+
+#define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
+/* Each table cannot take more than #symbols * FSELog bits */
+#define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) * LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8)
+
+static UNUSED_ATTR const U8 LL_bits[MaxLL+1] = {
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 6, 7, 8, 9,10,11,12,
+    13,14,15,16
+};
+static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL+1] = {
+     4, 3, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 1, 1, 1,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     2, 3, 2, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1
+};
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static UNUSED_ATTR const U8 ML_bits[MaxML+1] = {
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 4, 5, 7, 8, 9,10,11,
+    12,13,14,15,16
+};
+static UNUSED_ATTR const S16 ML_defaultNorm[MaxML+1] = {
+     1, 4, 3, 2, 2, 2, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1,-1,-1,
+    -1,-1,-1,-1,-1
+};
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static UNUSED_ATTR const S16 OF_defaultNorm[DefaultMaxOff+1] = {
+     1, 1, 1, 1, 1, 1, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1,-1
+};
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+    ZSTD_memcpy(dst, src, 8);
+#endif
+}
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+static void ZSTD_copy16(void* dst, const void* src) {
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#elif defined(ZSTD_ARCH_X86_SSE2)
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
+#else
+    ZSTD_memcpy(dst, src, 16);
+#endif
+}
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 32
+#define WILDCOPY_VECLEN 16
+
+typedef enum {
+    ZSTD_no_overlap,
+    ZSTD_overlap_src_before_dst
+    /*  ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ *  Custom version of ZSTD_memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_OVERLENGTH bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ *           The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+    ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+
+    if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+        /* Handle short offset copies. */
+        do {
+            COPY8(op, ip)
+        } while (op < oend);
+    } else {
+        assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+        /* Separate out the first COPY16() call because the copy length is
+         * almost certain to be short, so the branches have different
+         * probabilities. Since it is almost certain to be short, only do
+         * one COPY16() in the first call. Then, do two calls per loop since
+         * at that point it is more likely to have a high trip count.
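Before the branch logic resumes, the idea in miniature (editor's toy model, not zstd's implementation): copy in fixed 16-byte chunks and rely on the caller reserving WILDCOPY_OVERLENGTH bytes of slack.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Editor's toy model of wildcopy: each iteration copies a full 16-byte
 * vector, so up to 15 bytes past `length` may be written; callers must
 * over-allocate dst (zstd reserves WILDCOPY_OVERLENGTH = 32 bytes). The
 * real ZSTD_wildcopy additionally handles overlapping src/dst for matches. */
static void toy_wildcopy(uint8_t* dst, const uint8_t* src, size_t length)
{
    size_t pos = 0;
    do {
        memcpy(dst + pos, src + pos, 16);
        pos += 16;
    } while (pos < length);
}
```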
+ */ + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; + ip += 16; + do { + COPY16(op, ip); + COPY16(op, ip); + } + while (op < oend); + } +} + +MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + if (length > 0) { + ZSTD_memcpy(dst, src, length); + } + return length; +} + +/* define "workspace is too large" as this number of times larger than needed */ +#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 + +/* when workspace is continuously too large + * during at least this number of times, + * context's memory usage is considered wasteful, + * because it's sized to handle a worst case scenario which rarely happens. + * In which case, resize it down to free some memory */ +#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 + +/* Controls whether the input/output buffer is buffered or stable. */ +typedef enum { + ZSTD_bm_buffered = 0, /* Buffer the input/output */ + ZSTD_bm_stable = 1 /* ZSTD_inBuffer/ZSTD_outBuffer is stable */ +} ZSTD_bufferMode_e; + + +/*-******************************************* +* Private declarations +*********************************************/ +typedef struct seqDef_s { + U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ + U16 litLength; + U16 mlBase; /* mlBase == matchLength - MINMATCH */ +} seqDef; + +/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +typedef enum { + ZSTD_llt_none = 0, /* no longLengthType */ + ZSTD_llt_literalLength = 1, /* represents a long literal */ + ZSTD_llt_matchLength = 2 /* represents a long match */ +} ZSTD_longLengthType_e; + +typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ + BYTE* litStart; + BYTE* lit; /* ptr to end of literals */ + BYTE* llCode; + BYTE* mlCode; + BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + + /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. + */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +} seqStore_t; + +typedef struct { + U32 litLength; + U32 matchLength; +} ZSTD_sequenceLength; + +/** + * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences + * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. + */ +MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +{ + ZSTD_sequenceLength seqLen; + seqLen.litLength = seq->litLength; + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { + seqLen.matchLength += 0x10000; + } + } + return seqLen; +} + +/** + * Contains the compressed frame size and an upper-bound for the decompressed frame size. + * Note: before using `compressedSize`, check for errors using ZSTD_isError(). 
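A worked instance of the length decoding described above (field values made up by the editor):

```c
#include <stdio.h>

/* Editor's worked example mirroring ZSTD_getSequenceLength(): a stored
 * mlBase of 7 always decodes to 7 + MINMATCH(3) = 10 match bytes, and a
 * sequence flagged ZSTD_llt_literalLength gets 0x10000 added back to its
 * 16-bit stored litLength. */
int main(void)
{
    unsigned const mlBase = 7, litLength = 0x34;
    unsigned const matchLength = mlBase + 3;              /* 10 */
    unsigned const longLitLength = litLength + 0x10000;   /* 0x10034 */
    printf("match=%u lit=%u longLit=%u\n", matchLength, litLength, longLitLength);
    return 0;
}
```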
+ * similarly, before using `decompressedBound`, check for errors using:
+ *   `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
+ */
+typedef struct {
+    size_t compressedSize;
+    unsigned long long decompressedBound;
+} ZSTD_frameSizeInfo;   /* decompress & legacy */
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+/* custom memory allocation functions */
+void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
+void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;   /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/*  Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  decode sequence header from src */
+/*  Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                       const void* src, size_t srcSize);
+
+/**
+ * @returns true iff the CPU supports dynamic BMI2 dispatch.
+ */
+MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
+{
+    ZSTD_cpuid_t cpuid = ZSTD_cpuid();
+    return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
diff --git a/GraphBLAS/zstd/zstd_subset/common/zstd_trace.h b/GraphBLAS/zstd/zstd_subset/common/zstd_trace.h
new file mode 100644
index 000000000..6215f1e70
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/common/zstd_trace.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_TRACE_H
+#define ZSTD_TRACE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include <stddef.h>
+
+/* weak symbol support
+ * For now, enable conservatively:
+ * - Only GNUC
+ * - Only ELF
+ * - Only x86-64, i386 and aarch64
+ * Also, explicitly disable on platforms known not to work so they aren't
+ * forgotten in the future.
+ */
+#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && \
+    defined(__GNUC__) && defined(__ELF__) && \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) || defined(__aarch64__)) && \
+    !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) && \
+    !defined(__CYGWIN__) && !defined(_AIX)
+#  define ZSTD_HAVE_WEAK_SYMBOLS 1
+#else
+#  define ZSTD_HAVE_WEAK_SYMBOLS 0
+#endif
+#if ZSTD_HAVE_WEAK_SYMBOLS
+#  define ZSTD_WEAK_ATTR __attribute__((__weak__))
+#else
+#  define ZSTD_WEAK_ATTR
+#endif
+
+/* Only enable tracing when weak symbols are available.
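Since the hooks declared below are weak symbols, an application can observe zstd internals just by linking strong definitions of them; a hedged editor's sketch (assumes this zstd_trace.h is on the include path; hook names are the library's own):

```c
#include "zstd_trace.h"
#include <stdio.h>

#if ZSTD_TRACE
/* Editor's sketch: strong definitions override the weak hooks, so every
 * compression call reports its sizes. A non-zero return enables tracing. */
ZSTD_TraceCtx ZSTD_trace_compress_begin(struct ZSTD_CCtx_s const* cctx)
{
    (void)cctx;
    return 1;
}

void ZSTD_trace_compress_end(ZSTD_TraceCtx ctx, ZSTD_Trace const* trace)
{
    (void)ctx;
    fprintf(stderr, "zstd: %zu -> %zu bytes\n",
            trace->uncompressedSize, trace->compressedSize);
}
#endif
```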
*/ +#ifndef ZSTD_TRACE +# define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS +#endif + +#if ZSTD_TRACE + +struct ZSTD_CCtx_s; +struct ZSTD_DCtx_s; +struct ZSTD_CCtx_params_s; + +typedef struct { + /** + * ZSTD_VERSION_NUMBER + * + * This is guaranteed to be the first member of ZSTD_trace. + * Otherwise, this struct is not stable between versions. If + * the version number does not match your expectation, you + * should not interpret the rest of the struct. + */ + unsigned version; + /** + * Non-zero if streaming (de)compression is used. + */ + unsigned streaming; + /** + * The dictionary ID. + */ + unsigned dictionaryID; + /** + * Is the dictionary cold? + * Only set on decompression. + */ + unsigned dictionaryIsCold; + /** + * The dictionary size or zero if no dictionary. + */ + size_t dictionarySize; + /** + * The uncompressed size of the data. + */ + size_t uncompressedSize; + /** + * The compressed size of the data. + */ + size_t compressedSize; + /** + * The fully resolved CCtx parameters (NULL on decompression). + */ + struct ZSTD_CCtx_params_s const* params; + /** + * The ZSTD_CCtx pointer (NULL on decompression). + */ + struct ZSTD_CCtx_s const* cctx; + /** + * The ZSTD_DCtx pointer (NULL on compression). + */ + struct ZSTD_DCtx_s const* dctx; +} ZSTD_Trace; + +/** + * A tracing context. It must be 0 when tracing is disabled. + * Otherwise, any non-zero value returned by a tracing begin() + * function is presented to any subsequent calls to end(). + * + * Any non-zero value is treated as tracing is enabled and not + * interpreted by the library. + * + * Two possible uses are: + * * A timestamp for when the begin() function was called. + * * A unique key identifying the (de)compression, like the + * address of the [dc]ctx pointer if you need to track + * more information than just a timestamp. + */ +typedef unsigned long long ZSTD_TraceCtx; + +/** + * Trace the beginning of a compression call. + * @param cctx The dctx pointer for the compression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin( + struct ZSTD_CCtx_s const* cctx); + +/** + * Trace the end of a compression call. + * @param ctx The return value of ZSTD_trace_compress_begin(). + * @param trace The zstd tracing info. + */ +ZSTD_WEAK_ATTR void ZSTD_trace_compress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +/** + * Trace the beginning of a decompression call. + * @param dctx The dctx pointer for the decompression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin( + struct ZSTD_DCtx_s const* dctx); + +/** + * Trace the end of a decompression call. + * @param ctx The return value of ZSTD_trace_decompress_begin(). + * @param trace The zstd tracing info. + */ +ZSTD_WEAK_ATTR void ZSTD_trace_decompress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +#endif /* ZSTD_TRACE */ + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_TRACE_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/clevels.h b/GraphBLAS/zstd/zstd_subset/compress/clevels.h new file mode 100644 index 000000000..7ed2e0049 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/clevels.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CLEVELS_H +#define ZSTD_CLEVELS_H + +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ +#include "../zstd.h" + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_MAX_CLEVEL 22 + +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <= 256 KB */ + /* W, C, H, S, L, T, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 128 KB */ + /* W, C, H, S, L, T, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast 
}, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 16 KB */ + /* W, C, H, S, L, T, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + + + +#endif /* ZSTD_CLEVELS_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/fse_compress.c b/GraphBLAS/zstd/zstd_subset/compress/fse_compress.c new file mode 100644 index 000000000..21be8c54f --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/fse_compress.c @@ -0,0 +1,742 @@ +/* ****************************************************************** + * FSE : Finite State Entropy encoder + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "../common/compiler.h"
+#include "../common/mem.h"        /* U32, U16, etc. */
+#include "../common/debug.h"      /* assert, DEBUGLOG */
+#include "hist.h"                 /* HIST_count_wksp */
+#include "../common/bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#include "../common/error_private.h"
+#define ZSTD_DEPS_NEED_MALLOC
+#define ZSTD_DEPS_NEED_MATH64
+#include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "../common/bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 const maxSV1 = maxSymbolValue+1;
+
+    U16* cumul = (U16*)workSpace;   /* size = maxSV1 */
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1));  /* size = tableSize */
+
+    U32 highThreshold = tableSize-1;
+
+    assert(((size_t)workSpace & 1) == 0);  /* Must be 2 bytes-aligned */
+    if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
+    /* CTable header */
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for threshold strategy to work */
+
+    /* For explanations on how to distribute symbol values over the table :
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    #ifdef __clang_analyzer__
+    ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+    #endif
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u <= maxSV1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                assert(normalizedCounter[u-1] >= 0);
+                cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
+                assert(cumul[u] >= cumul[u-1]);  /* no overflow */
+        }   }
+        cumul[maxSV1] = (U16)(tableSize+1);
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        /* Case for no low prob count symbols. Lay down 8 bytes at a time
+         * to reduce branch misses since we are operating on a small block
+         */
+        BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
+        {   U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Spread symbols across the table. Lack of lowprob symbols means that
+         * we don't need variable sized inner loop, so we can unroll the loop and
+         * reduce branch misses.
+         */
+        {   size_t position = 0;
+            size_t s;
+            size_t const unroll = 2; /* Experimentally determined optimal unroll */
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableSymbol[uPosition] = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);   /* Must have initialized all positions */
+        }
+    } else {
+        U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<maxSV1; symbol++) {
+            int nbOccurrences;
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+        assert(position==0);  /* Must have initialized all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                assert(total <= INT_MAX);
+                symbolTT[s].deltaFindState = (int)(total - 1);
+                total ++;
+                break;
+            default :
+                assert(normalizedCounter[s] > 1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
+                    U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+                    total += (unsigned)normalizedCounter[s];
+            }   }   }   }
+
+#if 0  /* debug : symbol costs */
+    DEBUGLOG(5, "\n --- table statistics : ");
+    {   U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            DEBUGLOG(5, "%3u: w=%3i, maxBits=%u, fracBits=%.2f",
+                    symbol, normalizedCounter[symbol],
+                    FSE_getMaxNbBits(symbolTT, symbol),
+                    (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+    }   }
+#endif
+
+    return 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+* FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
+                                   + 4 /* bitCount initialized at 4 */
+                                   + 2 /* first two symbols may use one additional bit each */) / 8)
+                                    + 1 /* round up to whole nb bytes */
+                                    + 2 /* additional two bytes for bitstream flush */;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ?
use default */ +} + +static size_t +FSE_writeNCount_generic (void* header, size_t headerBufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE* const ostart = (BYTE*) header; + BYTE* out = ostart; + BYTE* const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream = 0; + int bitCount = 0; + unsigned symbol = 0; + unsigned const alphabetSize = maxSymbolValue + 1; + int previousIs0 = 0; + + /* Table Size */ + bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { + unsigned start = symbol; + while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++; + if (symbol == alphabetSize) break; /* incorrect distribution */ + while (symbol >= start+24) { + start+=24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend-2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE) bitStream; + out[1] = (BYTE)(bitStream>>8); + out+=2; + bitStream>>=16; + } + while (symbol >= start+3) { + start+=3; + bitStream += 3 << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; + bitCount += 2; + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + { int count = normalizedCounter[symbol++]; + int const max = (2*threshold-1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previousIs0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+}
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t size;
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)ZSTD_malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+/* Secondary normalization method.
+   To be used when primary method fails.
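+   The primary path in FSE_normalizeCount() can over-correct: when the residual
+   (stillToDistribute) reaches half of the largest symbol's share, this method
+   is used instead. It first pins low-probability symbols to a count of
+   lowProbCount or 1, then distributes the remaining table slots proportionally.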
+ */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = lowProbCount;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if (ToDistribute == 0)
+        return 0;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ZSTD_div64((((U64)1<<vStepLog) * ToDistribute) + mid, (U32)total);   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue, unsigned useLowProbCount)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        short const lowProbCount = useLowProbCount ? -1 : 1;
+        U64 const scale = 62 - tableLog;
+        U64 const step = ZSTD_div64((U64)1<<62, (U32)total);   /* <== here, one division !
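+            (each per-symbol probability below reuses this reciprocal via
+             multiply and shift: proba = (count[s]*step) >> scale)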
+         */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = lowProbCount;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+    }   }
+
+    return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*) ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[0].deltaNbBits = 0;
+    symbolTT[0].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+#ifndef ZSTD_NO_UNUSED_FUNCTIONS
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
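+ * (This avoids the large on-stack fseWkspMax_t that FSE_compress2() allocates.)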
+ * `wkspSize` size must be `(1<<tableLog)`.
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+    void* scratchBuffer = (void*)(CTable + CTableSize);
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_COMPRESS_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, HIST_count_wksp(count, &maxSymbolValue, src, srcSize, scratchBuffer, scratchBufferSize) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue, /* useLowProbCount */ srcSize >= 2048) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    union {
+      U32 hist_wksp[HIST_WKSP_SIZE_U32];
+      BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+    } workspace;
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t scratchBuffer;
+    DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_COMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+#endif
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/GraphBLAS/zstd/zstd_subset/compress/hist.c b/GraphBLAS/zstd/zstd_subset/compress/hist.c
new file mode 100644
index 000000000..073c57e75
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/hist.c
@@ -0,0 +1,181 @@
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "../common/mem.h"             /* U32, BYTE, etc. */
+#include "../common/debug.h"           /* assert, DEBUGLOG */
+#include "../common/error_private.h"   /* ERROR */
+#include "hist.h"
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned largestCount=0;
+
+    ZSTD_memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    {   U32 s;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > largestCount) largestCount = count[s];
+    }
+
+    return largestCount;
+}
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32.
+ * @return : largest histogram frequency,
+ *           or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                HIST_checkInput_e check,
+                                U32* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count);
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    /* safety checks */
+    assert(*maxSymbolValuePtr <= 255);
+    if (!sourceSize) {
+        ZSTD_memset(count, 0, countSize);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    ZSTD_memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    {   U32 s;
+        for (s=0; s<256; s++) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s] > max) max = Counting1[s];
+    }   }
+
+    {   unsigned maxSymbolValue = 255;
+        while (!Counting1[maxSymbolValue]) maxSymbolValue--;
+        if (check && maxSymbolValue > *maxSymbolValuePtr) return ERROR(maxSymbolValue_tooSmall);
+        *maxSymbolValuePtr = maxSymbolValue;
+        ZSTD_memmove(count, Counting1, countSize);   /* in case count & Counting1 are overlapping */
+    }
+    return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
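+ * The benefit is that this variant uses almost no stack space.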
+ * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if (sourceSize < 1500) /* heuristic threshold */ + return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace); +} + +/* HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + if (*maxSymbolValuePtr < 255) + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace); + *maxSymbolValuePtr = 255; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize); +} + +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); +} + +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters)); +} +#endif diff --git a/GraphBLAS/zstd/zstd_subset/compress/hist.h b/GraphBLAS/zstd/zstd_subset/compress/hist.h new file mode 100644 index 000000000..228ed48a7 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/hist.h @@ -0,0 +1,75 @@ +/* ****************************************************************** + * hist : Histogram functions + * part of Finite State Entropy project + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* --- dependencies --- */ +#include "../common/zstd_deps.h" /* size_t */ + + +/* --- simple histogram functions --- */ + +/*! HIST_count(): + * Provides the precise count of each byte within a table 'count'. + * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1). + * Updates *maxSymbolValuePtr with actual largest symbol value detected. + * @return : count of the most frequent symbol (which isn't identified). 
+ * or an error code, which can be tested using HIST_isError(). + * note : if return == srcSize, there is only one symbol. + */ +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */ + + +/* --- advanced histogram functions --- */ + +#define HIST_WKSP_SIZE_U32 1024 +#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned)) +/** HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * Benefit is this function will use very little stack space. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + void* workSpace, size_t workSpaceSize); + +/** HIST_countFast() : + * same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr. + * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` + */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); + +/** HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize, + void* workSpace, size_t workSpaceSize); + +/*! HIST_count_simple() : + * Same as HIST_countFast(), this function is unsafe, + * and will segfault if any value within `src` is `> *maxSymbolValuePtr`. + * It is also a bit slower for large inputs. + * However, it does not need any additional memory (not even on stack). + * @return : count of the most frequent symbol. + * Note this function doesn't produce any error (i.e. it must succeed). + */ +unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); diff --git a/GraphBLAS/zstd/zstd_subset/compress/huf_compress.c b/GraphBLAS/zstd/zstd_subset/compress/huf_compress.c new file mode 100644 index 000000000..5d90c162e --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/huf_compress.c @@ -0,0 +1,1450 @@ +/* ****************************************************************** + * Huffman encoder, part of New Generation Entropy library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "../common/zstd_deps.h"     /* ZSTD_memcpy, ZSTD_memset */
+#include "../common/compiler.h"
+#include "../common/bitstream.h"
+#include "hist.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "../common/fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "../common/huf.h"
+#include "../common/error_private.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Required declarations
+****************************************************************/
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+
+#if DEBUGLEVEL >= 2
+
+static size_t showU32(const U32* arr, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", arr[u]); (void)arr;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt);
+
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+#endif
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+#define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+{
+    size_t const mask = align - 1;
+    size_t const rem = (size_t)workspace & mask;
+    size_t const add = (align - rem) & mask;
+    BYTE* const aligned = (BYTE*)workspace + add;
+    assert((align & (align - 1)) == 0);   /* pow 2 */
+    assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+    if (*workspaceSizePtr >= add) {
+        assert(add < align);
+        assert(((size_t)aligned & mask) == 0);
+        *workspaceSizePtr -= add;
+        return aligned;
+    } else {
+        *workspaceSizePtr = 0;
+        return NULL;
+    }
+}
+
+
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
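+ * (A weight is stored as huffLog+1-nbBits, so every weight fits in 4 bits
+ *  when the table is written raw below.)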
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+
+typedef struct {
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)];
+    unsigned count[HUF_TABLELOG_MAX+1];
+    S16 norm[HUF_TABLELOG_MAX+1];
+} HUF_CompressWeightsWksp;
+
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+              const void* weightTable, size_t wtSize,
+                    void* workspace, size_t workspaceSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    unsigned maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+    HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
+
+    if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize);   /* never fails */
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;        /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return (size_t)(op-ostart);
+}
+
+static size_t HUF_getNbBits(HUF_CElt elt)
+{
+    return elt & 0xFF;
+}
+
+static size_t HUF_getNbBitsFast(HUF_CElt elt)
+{
+    return elt;
+}
+
+static size_t HUF_getValue(HUF_CElt elt)
+{
+    return elt & ~(size_t)0xFF;
+}
+
+static size_t HUF_getValueFast(HUF_CElt elt)
+{
+    return elt;
+}
+
+static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+{
+    assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+    *elt = nbBits;
+}
+
+static void HUF_setValue(HUF_CElt* elt, size_t value)
+{
+    size_t const nbBits = HUF_getNbBits(*elt);
+    if (nbBits > 0) {
+        assert((value >> nbBits) == 0);
+        *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
+    }
+}
+
+typedef struct {
+    HUF_CompressWeightsWksp wksp;
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+} HUF_WriteCTableWksp;
+
+size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+                            const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
+                            void* workspace, size_t workspaceSize)
+{
+    HUF_CElt const* const ct = CTable + 1;
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+    HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
+
+    /* check conditions */
+    if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    wksp->bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)
+        wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)
+        wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
+
+    /* attempt weights compression by FSE */
+    if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    wksp->huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)
+        op[(n/2)+1] = (BYTE)((wksp->huffWeight[n] << 4) + wksp->huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation.
+    @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
+{
+    HUF_WriteCTableWksp wksp;
+    return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    HUF_CElt* const ct = CTable + 1;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+    *hasZeroWeights = (rankVal[0] > 0);
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    CTable[0] = tableLog;
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 curr = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = curr;
+    }   }
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
+        /* determine starting value per rank */
+        {   U16 min = 0;
+            U32 n; for (n=HUF_TABLELOG_MAX; n>0; n--) { /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
+    }
+
+    *maxSymbolValuePtr = nbSymbols - 1;
+    return readSize;
+}
+
+U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
+{
+    const HUF_CElt* ct = CTable + 1;
+    assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    return (U32)HUF_getNbBits(ct[symbolValue]);
+}
+
+
+/**
+ * HUF_setMaxHeight():
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
+ *
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
+ *
+ * @pre               The sum of the ranks of each symbol == 2^largestBits,
+ *                    where largestBits == huffNode[lastNonNull].nbBits.
+ * @post              The sum of the ranks of each symbol == 2^largestBits,
+ *                    where largestBits is the return value (expected <= targetNbBits).
+ *
+ * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
+ *                    It's presumed sorted, from most frequent to rarest symbol.
+ * @param lastNonNull The symbol with the lowest count in the Huffman tree.
+ * @param targetNbBits  The allowed number of bits, which the Huffman tree
+ *                      may not respect. After this function the Huffman tree will
+ *                      respect targetNbBits.
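+ *                      (For example, with targetNbBits = 11 a leaf at depth 13
+ *                      is raised to depth 11; the excess weight this creates is
+ *                      repaid by pushing the smallest leaves of lower ranks one
+ *                      level deeper.)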
+ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) +{ + const U32 largestBits = huffNode[lastNonNull].nbBits; + /* early exit : no elt > targetNbBits, so the tree is already valid. */ + if (largestBits <= targetNbBits) return largestBits; + + DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; + const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + + /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ + while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } + /* n stops at huffNode[n].nbBits <= targetNbBits */ + assert(huffNode[n].nbBits <= targetNbBits); + /* n end at index of smallest symbol using < targetNbBits */ + while (huffNode[n].nbBits == targetNbBits) --n; + + /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ + assert((totalCost & (baseCost - 1)) == 0); + totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ + { U32 const noSymbol = 0xF0F0F0F0; + U32 rankLast[HUF_TABLELOG_MAX+2]; + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); + { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; + currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ + rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ + U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; + if (highPos == noSymbol) continue; + /* Decrease highPos if no symbols of lowPos or if it is + * not cheaper to remove 2 lowPos than highPos. + */ + if (lowPos == noSymbol) break; + { U32 const highTotal = huffNode[highPos].count; + U32 const lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) break; + } } + /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */ + assert(rankLast[nBitsToDecrease] != noSymbol || nBitsToDecrease == 1); + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease++; + assert(rankLast[nBitsToDecrease] != noSymbol); + /* Increase the number of bits to gain back half the rank cost. */ + totalCost -= 1 << (nBitsToDecrease-1); + huffNode[rankLast[nBitsToDecrease]].nbBits++; + + /* Fix up the new rank. + * If the new rank was empty, this symbol is now its smallest. + * Otherwise, this symbol will be the largest in the new rank so no adjustment. + */ + if (rankLast[nBitsToDecrease-1] == noSymbol) + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; + /* Fix up the old rank. 
+ * If the symbol was at position 0, meaning it was the highest weight symbol in the tree, + * it must be the only symbol in its rank, so the old rank now has no symbols. + * Otherwise, since the Huffman nodes are sorted by count, the previous position is now + * the smallest node in the rank. If the previous position belongs to a different rank, + * then the rank is now empty. + */ + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ + + /* If we've removed too much weight, then we have to add it back. + * To avoid overshooting again, we only adjust the smallest rank. + * We take the largest nodes from the lowest rank 0 and move them + * to rank 1. There's guaranteed to be enough rank 0 symbols because + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + /* special case : no rank 1 symbol (using targetNbBits-1); + * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { + while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); + totalCost++; + continue; + } + huffNode[ rankLast[1] + 1 ].nbBits--; + rankLast[1]++; + totalCost ++; + } + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + + return targetNbBits; +} + +typedef struct { + U16 base; + U16 curr; +} rankPos; + +typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; + +/* Number of buckets available for HUF_sort() */ +#define RANK_POSITION_TABLE_SIZE 192 + +typedef struct { + huffNodeTable huffNodeTbl; + rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; +} HUF_buildCTable_wksp_tables; + +/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. + * Strategy is to use as many buckets as possible for representing distinct + * counts while using the remainder to represent all "large" counts. + * + * To satisfy this requirement for 192 buckets, we can do the following: + * Let buckets 0-166 represent distinct counts of [0, 166] + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ +#define RANK_POSITION_MAX_COUNT_LOG 32 +#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ + +/* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. + */ +static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? 
count + : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; +} + +/* Helper swap function for HUF_quickSortPartition() */ +static void HUF_swapNodes(nodeElt* a, nodeElt* b) { + nodeElt tmp = *a; + *a = *b; + *b = tmp; +} + +/* Returns 0 if the huffNode array is not sorted by descending count */ +MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { + U32 i; + for (i = 1; i < maxSymbolValue1; ++i) { + if (huffNode[i].count > huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + +/** + * HUF_sort(): + * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. + * + * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. + * Must have (maxSymbolValue + 1) entries. + * @param[in] count Histogram of the symbols. + * @param[in] maxSymbolValue Maximum symbol value. + * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. + */ +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; + + /* Compute base and set curr to base. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. + * We attribute each symbol to lowerRank's base value, because we want to know where + * each rank begins in the output, so for rank R we want to count ranks R+1 and above. 
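+ * (Concretely, a symbol with count c is inserted at
+ *  rankPosition[HUF_getIndex(c)+1].curr++, i.e. right after all symbols that
+ *  fell into strictly larger buckets.)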
+ */ + ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); + for (n = 0; n < maxSymbolValue1; ++n) { + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); + rankPosition[lowerRank].base++; + } + + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ + for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { + rankPosition[n-1].base += rankPosition[n].base; + rankPosition[n-1].curr = rankPosition[n-1].base; + } + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ + for (n = 0; n < maxSymbolValue1; ++n) { + U32 const c = count[n]; + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} + + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). + */ +#define STARTNODE (HUF_SYMBOLVALUE_MAX+1) + +/* HUF_buildTree(): + * Takes the huffNode array sorted by HUF_sort() and builds an unlimited-depth Huffman tree. + * + * @param huffNode The array sorted by HUF_sort(). Builds the Huffman tree in this array. + * @param maxSymbolValue The maximum symbol value. + * @return The smallest node in the Huffman tree (by count). + */ +static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) +{ + nodeElt* const huffNode0 = huffNode - 1; + int nonNullRank; + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; + DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; + lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb; + huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count; + huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb; + nodeNb++; lowS-=2; + for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30); + huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */ + + /* create parents */ + while (nodeNb <= nodeRoot) { + int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? 
lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
+    return nonNullRank;
+}
+
+/**
+ * HUF_buildCTableFromTree():
+ * Build the CTable given the Huffman tree in huffNode.
+ *
+ * @param[out] CTable         The output Huffman CTable.
+ * @param      huffNode       The Huffman tree.
+ * @param      nonNullRank    The last and smallest node in the Huffman tree.
+ * @param      maxSymbolValue The maximum symbol value.
+ * @param      maxNbBits      The exact maximum number of bits used in the Huffman tree.
+ */
+static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
+{
+    HUF_CElt* const ct = CTable + 1;
+    /* fill result into ctable (val, nbBits) */
+    int n;
+    U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+    U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+    int const alphabetSize = (int)(maxSymbolValue + 1);
+    for (n=0; n<=nonNullRank; n++)
+        nbPerRank[huffNode[n].nbBits]++;
+    /* determine starting value per rank */
+    {   U16 min = 0;
+        for (n=(int)maxNbBits; n>0; n--) {
+            valPerRank[n] = min;      /* get starting value within each rank */
+            min += nbPerRank[n];
+            min >>= 1;
+    }   }
+    for (n=0; n<alphabetSize; n++)
+        HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
+    for (n=0; n<alphabetSize; n++)
+        HUF_setValue(ct + huffNode[n].byte, valPerRank[huffNode[n].nbBits]++);   /* assign value within rank, symbol order */
+    CTable[0] = maxNbBits;
+}
+
+size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                             void* workSpace, size_t wkspSize)
+{
+    HUF_buildCTable_wksp_tables* const wksp_tables =
+        (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+    nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+    nodeElt* const huffNode = huffNode0+1;
+    int nonNullRank;
+
+    DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
+    /* safety checks */
+    if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+        return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+        return ERROR(maxSymbolValue_tooLarge);
+    ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+    DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
+
+    /* build tree */
+    nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
+
+    /* determine and enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+    if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+
+    HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+
+    return maxNbBits;
+}
+
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    HUF_CElt const* ct = CTable + 1;
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += HUF_getNbBits(ct[s]) * count[s];
+    }
+    return nbBits >> 3;
+}
+
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+    HUF_CElt const* ct = CTable + 1;
+    int bad = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
+    }
+    return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+/** HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ *  1.
HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/**! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. 
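+ * This is what allows the main encoding loop to fill two bit containers per
+ * iteration with no serial dependency between them.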
+ */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + +FORCE_INLINE_TEMPLATE void +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) +{ + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); +} + +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. 
+ * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} + +/** + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. + */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} + + +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + U32 const tableLog = (U32)CTable[0]; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <= bitC.endPtr); + + return HUF_closeCStream(&bitC); +} + +#if DYNAMIC_BMI2 + +static BMI2_TARGET_ATTRIBUTE size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + 
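+/* HUF_compress1X_usingCTable_internal_bmi2() above and the _default() variant
+ * below are the two halves of the DYNAMIC_BMI2 dispatch pattern: the same
+ * force-inlined body is instantiated twice, once under BMI2_TARGET_ATTRIBUTE,
+ * and one of the two is selected at runtime. A minimal sketch of the pattern,
+ * using a hypothetical kernel body() in place of the real one:
+ *
+ *     FORCE_INLINE_TEMPLATE size_t body(void* dst, const void* src);
+ *     static BMI2_TARGET_ATTRIBUTE size_t k_bmi2(void* d, const void* s)
+ *     { return body(d, s); }
+ *     static size_t k_default(void* d, const void* s)
+ *     { return body(d, s); }
+ *     static size_t k(void* d, const void* s, int bmi2)
+ *     { return bmi2 ? k_bmi2(d, s) : k_default(d, s); }
+ */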
+static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + if (bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} + +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + (void)bmi2; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +#endif + +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); +} + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int bmi2) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } + + return (size_t)(op-ostart); +} + +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); +} + +typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + +static size_t HUF_compressCTable_internal( + BYTE* const 
ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) +{ + size_t const cSize = (nbStreams==HUF_singleStream) ? + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; + /* check compressibility */ + assert(op >= ostart); + if ((size_t)(op-ostart) >= srcSize-1) { return 0; } + return (size_t)(op-ostart); +} + + +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + unsigned tableLog = FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + assert(tableLog <= HUF_TABLELOG_MAX); + + return tableLog; +} + +typedef struct { + unsigned count[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; + union { + HUF_buildCTable_wksp_tables buildCTable_wksp; + HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; + } wksps; +} HUF_compress_tables_t; + +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + +/* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +static size_t +HUF_compress_internal (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, + const int bmi2, unsigned suspectUncompressible) +{ + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); + if (!srcSize) return 0; /* Uncompressed */ + if (!dstSize) return 0; /* cannot fit anything within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ + if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX; + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ + if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += 
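/* max count of any single symbol in the leading sample */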
largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + + /* Scan input and build symbol stats */ + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat + && *repeat == HUF_repeat_check + && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; + DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { + size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); + size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); + } + + /* Write table description header */ + { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog, + &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) ); + /* Check if using previous huffman table is beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } } + + /* Use the new huffman table */ + if (hSize + 12ul >= srcSize) { return 0; } + op += hSize; + if (repeat) { *repeat = HUF_repeat_none; } + if (oldHufTable) + ZSTD_memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, table->CTable, bmi2); +} + + +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/, 0); +} + +size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned 
maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+                      int bmi2, unsigned suspectUncompressible)
+{
+    DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, HUF_singleStream,
+                                 workSpace, wkspSize, hufTable,
+                                 repeat, preferRepeat, bmi2, suspectUncompressible);
+}
+
+/* HUF_compress4X_wksp():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    DEBUGLOG(5, "HUF_compress4X_wksp (srcSize = %zu)", srcSize);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, HUF_fourStreams,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * may skip compression early if the input looks incompressible.
+ * re-uses an existing Huffman compression table when beneficial. */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
+{
+    DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, HUF_fourStreams,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
+}
+
+#ifndef ZSTD_NO_UNUSED_FUNCTIONS
+/** HUF_buildCTable() :
+ * @return : maxNbBits
+ *  Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits)
+{
+    HUF_buildCTable_wksp_tables workspace;
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, &workspace, sizeof(workspace));
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                 unsigned maxSymbolValue, unsigned huffLog)
+{
+    U64 workSpace[HUF_WORKSPACE_SIZE_U64];
+    return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    U64 workSpace[HUF_WORKSPACE_SIZE_U64];
+    return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
+}
+#endif
diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress.c
new file mode 100644
index 000000000..59d441b2a
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress.c
@@ -0,0 +1,6520 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "../common/zstd_deps.h"  /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
+#include "../common/mem.h"
+#include "hist.h"                 /* HIST_countFast_wksp */
+#define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
+#include "../common/fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "../common/huf.h"
+#include "zstd_compress_internal.h"
+#include "zstd_compress_sequences.h"
+#include "zstd_compress_literals.h"
+#include "zstd_fast.h"
+#include "zstd_double_fast.h"
+#include "zstd_lazy.h"
+#include "zstd_opt.h"
+#include "zstd_ldm.h"
+#include "zstd_compress_superblock.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * ZSTD_COMPRESS_HEAPMODE :
+ * Select how the default compression function ZSTD_compress() allocates its context,
+ * on stack (0, default), or into heap (1).
+ * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected.
+ */
+#ifndef ZSTD_COMPRESS_HEAPMODE
+#  define ZSTD_COMPRESS_HEAPMODE 0
+#endif
+
+/*!
+ * ZSTD_HASHLOG3_MAX :
+ * Maximum size of the hash table dedicated to finding 3-byte matches,
+ * in log format, aka 17 => 1 << 17 == 128Ki positions.
+ * This structure is only used in zstd_opt.
+ * Since allocation is centralized for all strategies, it has to be known here.
+ * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3,
+ * so that zstd_opt.c doesn't need to know about this constant.
+ */
+#ifndef ZSTD_HASHLOG3_MAX
+#  define ZSTD_HASHLOG3_MAX 17
+#endif
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/* ZSTD_compressBound()
+ * Note that the result from this function is only valid for
+ * the "normal" full-block strategy.
+ * When there are many small blocks due to frequent flushes in streaming mode,
+ * the overhead of block and frame headers can make the compressed data
+ * larger than the value returned by ZSTD_compressBound().
+ */
+size_t ZSTD_compressBound(size_t srcSize) {
+    return ZSTD_COMPRESSBOUND(srcSize);
+}
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+struct ZSTD_CDict_s {
+    const void* dictContent;
+    size_t dictContentSize;
+    ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */
+    U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+    ZSTD_cwksp workspace;
+    ZSTD_matchState_t matchState;
+    ZSTD_compressedBlockState_t cBlockState;
+    ZSTD_customMem customMem;
+    U32 dictID;
+    int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
+    ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
+                                           * row-based matchfinder. Unless the cdict is reloaded, we will use
+                                           * the same greedy/lazy matchfinder at compression time.
+ */ +}; /* typedef'd to ZSTD_CDict within "zstd.h" */ + +ZSTD_CCtx* ZSTD_createCCtx(void) +{ + return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); +} + +static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) +{ + assert(cctx != NULL); + ZSTD_memset(cctx, 0, sizeof(*cctx)); + cctx->customMem = memManager; + cctx->bmi2 = ZSTD_cpuSupportsBmi2(); + { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); + assert(!ZSTD_isError(err)); + (void)err; + } +} + +ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_STATIC_ASSERT(zcss_init==0); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) return NULL; + ZSTD_initCCtx(cctx, customMem); + return cctx; + } +} + +ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) +{ + ZSTD_cwksp ws; + ZSTD_CCtx* cctx; + if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ + if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); + + cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); + if (cctx == NULL) return NULL; + + ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx)); + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + + /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ + if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; +} + +/** + * Clears and frees all of the dictionaries in the CCtx. + */ +static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) +{ + ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem); + ZSTD_freeCDict(cctx->localDict.cdict); + ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict)); + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); + cctx->cdict = NULL; +} + +static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) +{ + size_t const bufferSize = dict.dictBuffer != NULL ? 
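/* count the buffer only when the cctx owns a copy of the dictionary */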
dict.dictSize : 0; + size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict); + return bufferSize + cdictSize; +} + +static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) +{ + assert(cctx != NULL); + assert(cctx->staticSize == 0); + ZSTD_clearAllDicts(cctx); +#ifdef ZSTD_MULTITHREAD + ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; +#endif + ZSTD_cwksp_free(&cctx->workspace, cctx->customMem); +} + +size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); + { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); + if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; +} + + +static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_sizeof_CCtx(cctx->mtctx); +#else + (void)cctx; + return 0; +#endif +} + + +size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support sizeof on NULL */ + /* cctx may be in the workspace */ + return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx)) + + ZSTD_cwksp_sizeof(&cctx->workspace) + + ZSTD_sizeof_localDict(cctx->localDict) + + ZSTD_sizeof_mtctx(cctx); +} + +size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) +{ + return ZSTD_sizeof_CCtx(zcs); /* same object */ +} + +/* private API call, for dictBuilder only */ +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) + int const kHasSIMD128 = 1; +#else + int const kHasSIMD128 = 0; +#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (kHasSIMD128) { + if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + } else { + if (cParams->windowLog > 17) mode = ZSTD_ps_enable; + } + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? 
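/* in auto mode, split blocks only for strong strategies on large windows */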
ZSTD_ps_enable : ZSTD_ps_disable;
+}
+
+/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */
+static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+                                   const ZSTD_paramSwitch_e useRowMatchFinder,
+                                   const U32 forDDSDict) {
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+    /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate.
+     * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder.
+     */
+    return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
+}
+
+/* Returns ZSTD_ps_enable if compression parameters are such that we should
+ * enable long distance matching (wlog >= 27, strategy >= btopt).
+ * Returns ZSTD_ps_disable otherwise.
+ */
+static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+                                 const ZSTD_compressionParameters* const cParams) {
+    if (mode != ZSTD_ps_auto) return mode;
+    return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
+}
+
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
+}
+
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+        ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params cctxParams;
+    /* should not matter, as all cParams are presumed properly defined */
+    ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT);
+    cctxParams.cParams = cParams;
+
+    /* Adjust advanced params according to cParams */
+    cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams);
+    if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) {
+        ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams);
+        assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
+        assert(cctxParams.ldmParams.hashRateLog < 32);
+    }
+    cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+    assert(!ZSTD_checkCParams(cParams));
+    return cctxParams;
+}
+
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
+        ZSTD_customMem customMem)
+{
+    ZSTD_CCtx_params* params;
+    if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;
+    params = (ZSTD_CCtx_params*)ZSTD_customCalloc(
+            sizeof(ZSTD_CCtx_params), customMem);
+    if (!params) { return NULL; }
+    ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
+    params->customMem = customMem;
+    return params;
+}
+
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{
+    return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{
+    if (params == NULL) { return 0; }
+    ZSTD_customFree(params, params->customMem);
+    return 0;
+}
+
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{
+    return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
+}
+
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
+    RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+    ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->compressionLevel = compressionLevel;
+    cctxParams->fParams.contentSizeFlag = 1;
+    return 0;
+}
+
+#define ZSTD_NO_CLEVEL 0
+
+/**
+ * Initializes the cctxParams from params and compressionLevel.
+ * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
+ */
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
+{
+    assert(!ZSTD_checkCParams(params->cParams));
+    ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->cParams = params->cParams;
+    cctxParams->fParams = params->fParams;
+    /* Should not matter, as all cParams are presumed properly defined.
+     * But, set it for tracing anyway.
+     */
+    cctxParams->compressionLevel = compressionLevel;
+    cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+    cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
+    cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+    DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+             cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
+}
+
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{
+    RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+    FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");
+    ZSTD_CCtxParams_init_internal(cctxParams, &params, ZSTD_NO_CLEVEL);
+    return 0;
+}
+
+/**
+ * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
+ * @param params Validated zstd parameters.
+ */
+static void ZSTD_CCtxParams_setZstdParams(
+        ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
+{
+    assert(!ZSTD_checkCParams(params->cParams));
+    cctxParams->cParams = params->cParams;
+    cctxParams->fParams = params->fParams;
+    /* Should not matter, as all cParams are presumed properly defined.
+     * But, set it for tracing anyway.
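+     * (ZSTD_NO_CLEVEL == 0 records that these parameters were not derived
+     * from a numeric compression level.)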
+ */ + cctxParams->compressionLevel = ZSTD_NO_CLEVEL; +} + +ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + + switch(param) + { + case ZSTD_c_compressionLevel: + bounds.lowerBound = ZSTD_minCLevel(); + bounds.upperBound = ZSTD_maxCLevel(); + return bounds; + + case ZSTD_c_windowLog: + bounds.lowerBound = ZSTD_WINDOWLOG_MIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + + case ZSTD_c_hashLog: + bounds.lowerBound = ZSTD_HASHLOG_MIN; + bounds.upperBound = ZSTD_HASHLOG_MAX; + return bounds; + + case ZSTD_c_chainLog: + bounds.lowerBound = ZSTD_CHAINLOG_MIN; + bounds.upperBound = ZSTD_CHAINLOG_MAX; + return bounds; + + case ZSTD_c_searchLog: + bounds.lowerBound = ZSTD_SEARCHLOG_MIN; + bounds.upperBound = ZSTD_SEARCHLOG_MAX; + return bounds; + + case ZSTD_c_minMatch: + bounds.lowerBound = ZSTD_MINMATCH_MIN; + bounds.upperBound = ZSTD_MINMATCH_MAX; + return bounds; + + case ZSTD_c_targetLength: + bounds.lowerBound = ZSTD_TARGETLENGTH_MIN; + bounds.upperBound = ZSTD_TARGETLENGTH_MAX; + return bounds; + + case ZSTD_c_strategy: + bounds.lowerBound = ZSTD_STRATEGY_MIN; + bounds.upperBound = ZSTD_STRATEGY_MAX; + return bounds; + + case ZSTD_c_contentSizeFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_checksumFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_dictIDFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_nbWorkers: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_NBWORKERS_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_jobSize: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_JOBSIZE_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_overlapLog: +#ifdef ZSTD_MULTITHREAD + bounds.lowerBound = ZSTD_OVERLAPLOG_MIN; + bounds.upperBound = ZSTD_OVERLAPLOG_MAX; +#else + bounds.lowerBound = 0; + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_enableDedicatedDictSearch: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_enableLongDistanceMatching: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_ldmHashLog: + bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; + return bounds; + + case ZSTD_c_ldmMinMatch: + bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN; + bounds.upperBound = ZSTD_LDM_MINMATCH_MAX; + return bounds; + + case ZSTD_c_ldmBucketSizeLog: + bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN; + bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX; + return bounds; + + case ZSTD_c_ldmHashRateLog: + bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX; + return bounds; + + /* experimental parameters */ + case ZSTD_c_rsyncable: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_forceMaxWindow : + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_format: + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + bounds.lowerBound = ZSTD_f_zstd1; + bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? 
*/ + return bounds; + + case ZSTD_c_forceAttachDict: + ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad); + bounds.lowerBound = ZSTD_dictDefaultAttach; + bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_literalCompressionMode: + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_targetCBlockSize: + bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN; + bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + + case ZSTD_c_blockDelimiters: + bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters; + bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters; + return bounds; + + case ZSTD_c_validateSequences: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_useBlockSplitter: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_prefetchCDictTables: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; + } +} + +/* ZSTD_cParam_clampBounds: + * Clamps the value into the bounded range. 
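+ * For example, a compressionLevel request of 1000 is clamped down to
+ * ZSTD_maxCLevel() rather than rejected.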
+ */ +static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return bounds.error; + if (*value < bounds.lowerBound) *value = bounds.lowerBound; + if (*value > bounds.upperBound) *value = bounds.upperBound; + return 0; +} + +#define BOUNDCHECK(cParam, val) { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ +} + + +static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +{ + switch(param) + { + case ZSTD_c_compressionLevel: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + return 1; + + case ZSTD_c_format: + case ZSTD_c_windowLog: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow : + case ZSTD_c_nbWorkers: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + default: + return 0; + } +} + +size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value); + if (cctx->streamStage != zcss_init) { + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { + RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); + } } + + switch(param) + { + case ZSTD_c_nbWorkers: + RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported, + "MT not compatible with static alloc"); + break; + + case ZSTD_c_compressionLevel: + case ZSTD_c_windowLog: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_format: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value); + switch(param) + { + case 
ZSTD_c_format : + BOUNDCHECK(ZSTD_c_format, value); + CCtxParams->format = (ZSTD_format_e)value; + return (size_t)CCtxParams->format; + + case ZSTD_c_compressionLevel : { + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + if (value == 0) + CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + else + CCtxParams->compressionLevel = value; + if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; + return 0; /* return type (size_t) cannot represent negative values */ + } + + case ZSTD_c_windowLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_windowLog, value); + CCtxParams->cParams.windowLog = (U32)value; + return CCtxParams->cParams.windowLog; + + case ZSTD_c_hashLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_hashLog, value); + CCtxParams->cParams.hashLog = (U32)value; + return CCtxParams->cParams.hashLog; + + case ZSTD_c_chainLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_chainLog, value); + CCtxParams->cParams.chainLog = (U32)value; + return CCtxParams->cParams.chainLog; + + case ZSTD_c_searchLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_searchLog, value); + CCtxParams->cParams.searchLog = (U32)value; + return (size_t)value; + + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); + CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); + CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_strategy, value); + CCtxParams->cParams.strategy = (ZSTD_strategy)value; + return (size_t)CCtxParams->cParams.strategy; + + case ZSTD_c_contentSizeFlag : + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; + return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; + return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); + CCtxParams->fParams.noDictIDFlag = !value; + return !CCtxParams->fParams.noDictIDFlag; + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); + return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; + BOUNDCHECK(ZSTD_c_forceAttachDict, pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; + BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } + + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + CCtxParams->nbWorkers = value; + return CCtxParams->nbWorkers; +#endif + + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, 
parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + /* Adjust to the minimum non-default value. */ + if (value != 0 && value < ZSTDMT_JOBSIZE_MIN) + value = ZSTDMT_JOBSIZE_MIN; + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + assert(value >= 0); + CCtxParams->jobSize = value; + return CCtxParams->jobSize; +#endif + + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->overlapLog = value; + return CCtxParams->overlapLog; +#endif + + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->rsyncable = value; + return CCtxParams->rsyncable; +#endif + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); + return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); + CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); + CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); + CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); + CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); + CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); + CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value; + return CCtxParams->inBufferMode; + + case ZSTD_c_stableOutBuffer: + BOUNDCHECK(ZSTD_c_stableOutBuffer, value); + CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value; + return CCtxParams->outBufferMode; + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); + CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; + return CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); + CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; + return CCtxParams->useBlockSplitter; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case 
ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + + case ZSTD_c_prefetchCDictTables: + BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); + CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; + return CCtxParams->prefetchCDictTables; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +} + +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value) +{ + return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_getParameter( + ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value) +{ + switch(param) + { + case ZSTD_c_format : + *value = CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; + break; + case ZSTD_c_windowLog : + *value = (int)CCtxParams->cParams.windowLog; + break; + case ZSTD_c_hashLog : + *value = (int)CCtxParams->cParams.hashLog; + break; + case ZSTD_c_chainLog : + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : + *value = CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : + *value = CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : + *value = CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : + *value = (unsigned)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; + break; + case ZSTD_c_checksumFlag : + *value = CCtxParams->fParams.checksumFlag; + break; + case ZSTD_c_dictIDFlag : + *value = !CCtxParams->fParams.noDictIDFlag; + break; + case ZSTD_c_forceMaxWindow : + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : + *value = CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : + *value = CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + assert(CCtxParams->nbWorkers == 0); +#endif + *value = CCtxParams->nbWorkers; + break; + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + assert(CCtxParams->jobSize <= INT_MAX); + *value = (int)CCtxParams->jobSize; + break; +#endif + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->overlapLog; + break; +#endif + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->rsyncable; + break; +#endif + case ZSTD_c_enableDedicatedDictSearch : + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : + *value = CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : + *value = CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : + *value = CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : + *value = CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : + *value = CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; + break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; + case ZSTD_c_stableInBuffer : + *value = (int)CCtxParams->inBufferMode; + break; + case ZSTD_c_stableOutBuffer : + *value = 
(int)CCtxParams->outBufferMode; + break; + case ZSTD_c_blockDelimiters : + *value = (int)CCtxParams->blockDelimiters; + break; + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; + case ZSTD_c_useBlockSplitter : + *value = (int)CCtxParams->useBlockSplitter; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; + case ZSTD_c_prefetchCDictTables: + *value = (int)CCtxParams->prefetchCDictTables; + break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +} + +/** ZSTD_CCtx_setParametersUsingCCtxParams() : + * just applies `params` into `cctx` + * no action is performed, parameters are merely stored. + * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. + * This is possible even if a compression is ongoing. + * In which case, new parameters will be applied on the fly, starting with next compression job. + */ +size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "The context is in the wrong stage!"); + RETURN_ERROR_IF(cctx->cdict, stage_wrong, + "Can't override parameters with cdict attached (some must " + "be inherited from the cdict)."); + + cctx->requestedParams = *params; + return 0; +} + +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + return 0; +} + +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams( + int const compressionLevel, + size_t const dictSize); +static int ZSTD_dedicatedDictSearch_isSupported( + const ZSTD_compressionParameters* cParams); +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + +/** + * Initializes the local dict using the requested parameters. + * NOTE: This does not use the pledged src size, because it may be used for more + * than one compression. + */ +static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) +{ + ZSTD_localDict* const dl = &cctx->localDict; + if (dl->dict == NULL) { + /* No local dictionary. */ + assert(dl->dictBuffer == NULL); + assert(dl->cdict == NULL); + assert(dl->dictSize == 0); + return 0; + } + if (dl->cdict != NULL) { + assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ + return 0; + } + assert(dl->dictSize > 0); + assert(cctx->cdict == NULL); + assert(cctx->prefixDict.dict == NULL); + + dl->cdict = ZSTD_createCDict_advanced2( + dl->dict, + dl->dictSize, + ZSTD_dlm_byRef, + dl->dictContentType, + &cctx->requestedParams, + cctx->customMem); + RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); + cctx->cdict = dl->cdict; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_advanced( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); + ZSTD_clearAllDicts(cctx); /* in case one already exists */ + if (dict == NULL || dictSize == 0) /* no dictionary mode */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "no malloc for static CCtx"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); + RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); + ZSTD_memcpy(dictBuffer, dict, dictSize); + cctx->localDict.dictBuffer = dictBuffer; + cctx->localDict.dict = dictBuffer; + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_byReference( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + + +size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a dict when ctx not in init stage."); + /* Free the existing local cdict (if any) to save memory. */ + ZSTD_clearAllDicts(cctx); + cctx->cdict = cdict; + return 0; +} + +size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a pool when ctx not in init stage."); + cctx->pool = pool; + return 0; +} + +size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + +size_t ZSTD_CCtx_refPrefix_advanced( + ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a prefix when ctx not in init stage."); + ZSTD_clearAllDicts(cctx); + if (prefix != NULL && prefixSize > 0) { + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + } + return 0; +} + +/*! 
ZSTD_CCtx_reset() :
+ *  Also dumps dictionary */
+size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+{
+    if ( (reset == ZSTD_reset_session_only)
+      || (reset == ZSTD_reset_session_and_parameters) ) {
+        cctx->streamStage = zcss_init;
+        cctx->pledgedSrcSizePlusOne = 0;
+    }
+    if ( (reset == ZSTD_reset_parameters)
+      || (reset == ZSTD_reset_session_and_parameters) ) {
+        RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                        "Can't reset parameters only when not in init stage.");
+        ZSTD_clearAllDicts(cctx);
+        return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+    }
+    return 0;
+}
+
+
+/** ZSTD_checkCParams() :
+ *  checks that CParam values remain within authorized range.
+ *  @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+    BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog);
+    BOUNDCHECK(ZSTD_c_chainLog,  (int)cParams.chainLog);
+    BOUNDCHECK(ZSTD_c_hashLog,   (int)cParams.hashLog);
+    BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+    BOUNDCHECK(ZSTD_c_minMatch,  (int)cParams.minMatch);
+    BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+    BOUNDCHECK(ZSTD_c_strategy,  cParams.strategy);
+    return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  clamps CParam values into the valid range.
+ *  @return : valid CParams */
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+#   define CLAMP_TYPE(cParam, val, type) {                                \
+        ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
+        if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
+        else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+    }
+#   define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+    CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+    CLAMP(ZSTD_c_chainLog,  cParams.chainLog);
+    CLAMP(ZSTD_c_hashLog,   cParams.hashLog);
+    CLAMP(ZSTD_c_searchLog, cParams.searchLog);
+    CLAMP(ZSTD_c_minMatch,  cParams.minMatch);
+    CLAMP(ZSTD_c_targetLength,cParams.targetLength);
+    CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy);
+    return cParams;
+}
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+    return hashLog - btScale;
+}
+
+/** ZSTD_dictAndWindowLog() :
+ * Returns an adjusted window log that is large enough to fit the source and the dictionary.
+ * The zstd format says that the entire dictionary is valid if one byte of the dictionary
+ * is within the window. So the hashLog and chainLog should be large enough to reference both
+ * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing
+ * the hashLog and windowLog.
+ * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN.
+ */
+static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize)
+{
+    const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX;
+    /* No dictionary ==> No change */
+    if (dictSize == 0) {
+        return windowLog;
+    }
+    assert(windowLog <= ZSTD_WINDOWLOG_MAX);
+    assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */
+    {
+        U64 const windowSize = 1ULL << windowLog;
+        U64 const dictAndWindowSize = dictSize + windowSize;
+        /* If the window size is already large enough to fit both the source and the dictionary
+         * then just use the window size. Otherwise adjust so that it fits the dictionary and
+         * the window.
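+         * Worked example: windowLog = 20 (1 MiB window), dictSize = 512 KiB,
+         * srcSize = 1 MiB: the window alone cannot hold dict + src (1.5 MiB),
+         * and 1.5 MiB is far below the maximum window size, so the result is
+         * ZSTD_highbit32(1.5 MiB - 1) + 1 = 21.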
+ */ + if (windowSize >= dictSize + srcSize) { + return windowLog; /* Window size large enough already */ + } else if (dictAndWindowSize >= maxWindowSize) { + return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */ + } else { + return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1; + } + } +} + +/** ZSTD_adjustCParams_internal() : + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. + * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ +static ZSTD_compressionParameters +ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, + ZSTD_cParamMode_e mode) +{ + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + /* If we don't know the source size, don't make any + * assumptions about it. We will already have selected + * smaller parameters if a dictionary is in use. + */ + break; + case ZSTD_cpm_createCDict: + /* Assume a small source size when creating a dictionary + * with an unknown source size. + */ + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; + break; + case ZSTD_cpm_attachDict: + /* Dictionary has its own dedicated parameters which have + * already been selected. We are selecting parameters + * for only the source. + */ + dictSize = 0; + break; + default: + assert(0); + break; + } + + /* resize windowLog if input is small enough, to use less memory */ + if ( (srcSize < maxWindowResize) + && (dictSize < maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : + ZSTD_highbit32(tSize-1) + 1; + if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; + } + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; + if (cycleLog > dictAndWindowLog) + cPar.chainLog -= (cycleLog - dictAndWindowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + + if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { + U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; + if (cPar.hashLog > maxShortCacheHashLog) { + cPar.hashLog = maxShortCacheHashLog; + } + } + + return cPar; +} + +ZSTD_compressionParameters +ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); +} + +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); + +static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, + const ZSTD_compressionParameters* overrides) +{ + if (overrides->windowLog) cParams->windowLog = overrides->windowLog; + if (overrides->hashLog) cParams->hashLog = overrides->hashLog; + if (overrides->chainLog) cParams->chainLog = overrides->chainLog; + if (overrides->searchLog) cParams->searchLog = overrides->searchLog; + if (overrides->minMatch) cParams->minMatch = overrides->minMatch; + if (overrides->targetLength) cParams->targetLength = overrides->targetLength; + if (overrides->strategy) cParams->strategy = overrides->strategy; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) +{ + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + srcSizeHint = CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); +} + +static size_t +ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, + const U32 enableDedicatedDictSearch, + const U32 forCCtx) +{ + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? 
MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+    /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't
+     * surrounded by redzones in ASAN. */
+    size_t const tableSpace = chainSize * sizeof(U32)
+                            + hSize * sizeof(U32)
+                            + h3Size * sizeof(U32);
+    size_t const optPotentialSpace =
+        ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+    size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            : 0;
+    size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+                                ? optPotentialSpace
+                                : 0;
+    size_t const slackSpace = ZSTD_cwksp_slack_space_required();
+
+    /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
+    ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+
+    DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+                (U32)chainSize, (U32)hSize, (U32)h3Size);
+    return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
+}
+
+static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+        const ZSTD_compressionParameters* cParams,
+        const ldmParams_t* ldmParams,
+        const int isStatic,
+        const ZSTD_paramSwitch_e useRowMatchFinder,
+        const size_t buffInSize,
+        const size_t buffOutSize,
+        const U64 pledgedSrcSize)
+{
+    size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
+    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+    U32    const divider = (cParams->minMatch==3) ? 3 : 4;
+    size_t const maxNbSeq = blockSize / divider;
+    size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+                            + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
+                            + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+    size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
+    size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
+
+    size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
+    size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
+    size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
+        ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
+
+
+    size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize)
+                             + ZSTD_cwksp_alloc_size(buffOutSize);
+
+    size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+
+    size_t const neededSpace =
+        cctxSpace +
+        entropySpace +
+        blockStateSpace +
+        ldmSpace +
+        ldmSeqSpace +
+        matchStateSize +
+        tokenSpace +
+        bufferSpace;
+
+    DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+    return neededSpace;
+}
+
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+    ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+                                                                               &cParams);
+
+    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+    /* estimateCCtxSize is for one-shot compression.
So no buffers should
+     * be needed. However, we still allocate two 0-sized buffers, which can
+     * take space under ASAN. */
+    return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_ps_disable;
+        noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_ps_enable;
+        rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+    }
+}
+
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+    int tier = 0;
+    size_t largestSize = 0;
+    static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN};
+    for (; tier < 4; ++tier) {
+        /* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */
+        ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict);
+        largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize);
+    }
+    return largestSize;
+}
+
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+    int level;
+    size_t memBudget = 0;
+    for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+        /* Ensure monotonically increasing memory usage as compression level increases */
+        size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
+                ? ((size_t)1 << cParams.windowLog) + blockSize
+                : 0;
+        size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
+                ?
ZSTD_compressBound(blockSize) + 1
+                : 0;
+        ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
+
+        return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+            &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+            ZSTD_CONTENTSIZE_UNKNOWN);
+    }
+}
+
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_ps_disable;
+        noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_ps_enable;
+        rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+    }
+}
+
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+    return ZSTD_estimateCStreamSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
+    int level;
+    size_t memBudget = 0;
+    for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+        size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads (non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_getFrameProgression(cctx->mtctx);
+    }
+#endif
+    {   ZSTD_frameProgression fp;
+        size_t const buffered = (cctx->inBuff == NULL) ? 0 :
+                                cctx->inBuffPos - cctx->inToCompress;
+        if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress);
+        assert(buffered <= ZSTD_BLOCKSIZE_MAX);
+        fp.ingested = cctx->consumedSrcSize + buffered;
+        fp.consumed = cctx->consumedSrcSize;
+        fp.produced = cctx->producedCSize;
+        fp.flushed  = cctx->producedCSize;   /* simplified; some data might still be left within streaming output buffer */
+        fp.currentJobID = 0;
+        fp.nbActiveWorkers = 0;
+        return fp;
+}   }
+
+/*! ZSTD_toFlushNow()
+ *  Only useful for multithreading scenarios currently (nbWorkers >= 1).
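+ *  Illustrative use (a sketch, not an API requirement): a caller driving
+ *  ZSTD_compressStream2() in non-blocking mode could poll
+ *      if (ZSTD_toFlushNow(cctx) > 0) issue a ZSTD_e_flush step;
+ *  to drain worker output eagerly instead of waiting for the next write.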
+ */ +size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_toFlushNow(cctx->mtctx); + } +#endif + (void)cctx; + return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ +} + +static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + (void)cParams1; + (void)cParams2; + assert(cParams1.windowLog == cParams2.windowLog); + assert(cParams1.chainLog == cParams2.chainLog); + assert(cParams1.hashLog == cParams2.hashLog); + assert(cParams1.searchLog == cParams2.searchLog); + assert(cParams1.minMatch == cParams2.minMatch); + assert(cParams1.targetLength == cParams2.targetLength); + assert(cParams1.strategy == cParams2.strategy); +} + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + bs->rep[i] = repStartValue[i]; + bs->entropy.huf.repeatMode = HUF_repeat_none; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; +} + +/*! ZSTD_invalidateMatchState() + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). + */ +static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) +{ + ZSTD_window_clear(&ms->window); + + ms->nextToUpdate = ms->window.dictLimit; + ms->loadedDictEnd = 0; + ms->opt.litLengthSum = 0; /* force reset of btopt stats */ + ms->dictMatchState = NULL; +} + +/** + * Controls, for this matchState reset, whether the tables need to be cleared / + * prepared for the coming compression (ZSTDcrp_makeClean), or whether the + * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a + * subsequent operation will overwrite the table space anyways (e.g., copying + * the matchState contents in from a CDict). + */ +typedef enum { + ZSTDcrp_makeClean, + ZSTDcrp_leaveDirty +} ZSTD_compResetPolicy_e; + +/** + * Controls, for this matchState reset, whether indexing can continue where it + * left off (ZSTDirp_continue), or whether it needs to be restarted from zero + * (ZSTDirp_reset). + */ +typedef enum { + ZSTDirp_continue, + ZSTDirp_reset +} ZSTD_indexResetPolicy_e; + +typedef enum { + ZSTD_resetTarget_CDict, + ZSTD_resetTarget_CCtx +} ZSTD_resetTarget_e; + + +static size_t +ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +{ + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? 
((size_t)1) << hashLog3 : 0;
+
+    DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+    if (forceResetIndex == ZSTDirp_reset) {
+        ZSTD_window_init(&ms->window);
+        ZSTD_cwksp_mark_tables_dirty(ws);
+    }
+
+    ms->hashLog3 = hashLog3;
+
+    ZSTD_invalidateMatchState(ms);
+
+    assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */
+
+    ZSTD_cwksp_clear_tables(ws);
+
+    DEBUGLOG(5, "reserving table space");
+    /* table Space */
+    ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32));
+    ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32));
+    ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32));
+    RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+                    "failed a workspace allocation in ZSTD_reset_matchState");
+
+    DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty);
+    if (crp!=ZSTDcrp_leaveDirty) {
+        /* reset tables only */
+        ZSTD_cwksp_clean_tables(ws);
+    }
+
+    /* opt parser space */
+    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+        DEBUGLOG(4, "reserving optimal parser space");
+        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+    }
+
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        {   /* Row match finder needs an additional table of hashes ("tags") */
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
+        }
+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+            assert(cParams->hashLog >= rowLog);
+            ms->rowHashLog = cParams->hashLog - rowLog;
+        }
+    }
+
+    ms->cParams = *cParams;
+
+    RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+                    "failed a workspace allocation in ZSTD_reset_matchState");
+    return 0;
+}
+
+/* ZSTD_indexTooCloseToMax() :
+ * minor optimization : prefer memset() rather than reduceIndex()
+ * which is measurably slow in some circumstances (reported for Visual Studio).
+ * Works when re-using a context for a lot of smallish inputs :
+ * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN,
+ * memset() will be triggered before reduceIndex().
+ */
+#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB)
+static int ZSTD_indexTooCloseToMax(ZSTD_window_t w)
+{
+    return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
+}
+
+/** ZSTD_dictTooBig():
+ * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in
+ * one go generically. So we ensure that in that case we reset the tables to zero,
+ * so that we can load as much of the dictionary as possible.
+ */
+static int ZSTD_dictTooBig(size_t const loadedDictSize)
+{
+    return loadedDictSize > ZSTD_CHUNKSIZE_MAX;
+}
+
+/*! ZSTD_resetCCtx_internal() :
+ * @param loadedDictSize The size of the dictionary to be loaded
+ * into the context, if any.
If no dictionary is used, or the
+ * dictionary is being attached / copied, then pass 0.
+ * note : `params` are assumed fully validated at this stage.
+ */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+                                      ZSTD_CCtx_params const* params,
+                                      U64 const pledgedSrcSize,
+                                      size_t const loadedDictSize,
+                                      ZSTD_compResetPolicy_e const crp,
+                                      ZSTD_buffered_policy_e const zbuff)
+{
+    ZSTD_cwksp* const ws = &zc->workspace;
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+                (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
+    assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+
+    zc->isFirstBlock = 1;
+
+    /* Set applied params early so we can modify them for LDM,
+     * and point params at the applied params.
+     */
+    zc->appliedParams = *params;
+    params = &zc->appliedParams;
+
+    assert(params->useRowMatchFinder != ZSTD_ps_auto);
+    assert(params->useBlockSplitter != ZSTD_ps_auto);
+    assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+        /* Adjust long distance matching parameters */
+        ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+        assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog);
+        assert(params->ldmParams.hashRateLog < 32);
+    }
+
+    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+        size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
+                ? ZSTD_compressBound(blockSize) + 1
+                : 0;
+        size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered)
+                ? windowSize + blockSize
+                : 0;
+        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize);
+
+        int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window);
+        int const dictTooBig = ZSTD_dictTooBig(loadedDictSize);
+        ZSTD_indexResetPolicy_e needsIndexReset =
+            (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue;
+
+        size_t const neededSpace =
+            ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+                &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
+                buffInSize, buffOutSize, pledgedSrcSize);
+        int resizeWorkspace;
+
+        FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
+
+        if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0);
+
+        {   /* Check if workspace is large enough, alloc a new one if needed */
+            int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+            int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+            resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+            DEBUGLOG(4, "Need %zu B workspace", neededSpace);
+            DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+            if (resizeWorkspace) {
+                DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
+                            ZSTD_cwksp_sizeof(ws) >> 10,
+                            neededSpace >> 10);
+
+                RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize");
+
+                needsIndexReset = ZSTDirp_reset;
+
+                ZSTD_cwksp_free(ws, zc->customMem);
+                FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), "");
+
+                DEBUGLOG(5, "reserving object space");
+                /* Statically sized space.
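+                 * (These objects are reserved once per workspace and survive
+                 * later ZSTD_cwksp_clear() calls, unlike tables and buffers.)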
+                 * entropyWorkspace never moves,
+                 * though prev/next block swap places */
+                assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+                zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+                RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+                zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+                RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+                zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
+                RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+        }   }
+
+        ZSTD_cwksp_clear(ws);
+
+        /* init params */
+        zc->blockState.matchState.cParams = params->cParams;
+        zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
+        zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+        zc->consumedSrcSize = 0;
+        zc->producedCSize = 0;
+        if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+            zc->appliedParams.fParams.contentSizeFlag = 0;
+        DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+            (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+        zc->blockSize = blockSize;
+
+        XXH64_reset(&zc->xxhState, 0);
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+        zc->dictContentSize = 0;
+
+        ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+        zc->seqStore.maxNbLit = blockSize;
+
+        /* buffers */
+        zc->bufferedPolicy = zbuff;
+        zc->inBuffSize = buffInSize;
+        zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+        zc->outBuffSize = buffOutSize;
+        zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
+
+        /* ldm bucketOffsets table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const numBuckets =
+                  ((size_t)1) << (params->ldmParams.hashLog -
+                                  params->ldmParams.bucketSizeLog);
+            zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets);
+            ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets);
+        }
+
+        /* sequences storage */
+        ZSTD_referenceExternalSequences(zc, NULL, 0);
+        zc->seqStore.maxNbSeq = maxNbSeq;
+        zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+        zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+        zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+        FORWARD_IF_ERROR(ZSTD_reset_matchState(
+            &zc->blockState.matchState,
+            ws,
+            &params->cParams,
+            params->useRowMatchFinder,
+            crp,
+            needsIndexReset,
+            ZSTD_resetTarget_CCtx), "");
+
+        /* ldm hash table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset?
*/
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            ZSTD_window_init(&zc->ldmState.window);
+            zc->ldmState.loadedDictEnd = 0;
+        }
+
+        DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+
+        zc->initialized = 1;
+
+        return 0;
+    }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+    int i;
+    for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0;
+    assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place.
+ */
+static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = {
+    8 KB,  /* unused */
+    8 KB,  /* ZSTD_fast */
+    16 KB, /* ZSTD_dfast */
+    32 KB, /* ZSTD_greedy */
+    32 KB, /* ZSTD_lazy */
+    32 KB, /* ZSTD_lazy2 */
+    32 KB, /* ZSTD_btlazy2 */
+    32 KB, /* ZSTD_btopt */
+    8 KB,  /* ZSTD_btultra */
+    8 KB   /* ZSTD_btultra2 */
+};
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+                                 const ZSTD_CCtx_params* params,
+                                 U64 pledgedSrcSize)
+{
+    size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+    int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch;
+    return dedicatedDictSearch
+        || ( ( pledgedSrcSize <= cutoff
+            || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+            || params->attachDictPref == ZSTD_dictForceAttach )
+          && params->attachDictPref != ZSTD_dictForceCopy
+          && !params->forceWindow ); /* dictMatchState isn't correctly
+                                      * handled in _enforceMaxDist */
+}
+
+static size_t
+ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+                        const ZSTD_CDict* cdict,
+                        ZSTD_CCtx_params params,
+                        U64 pledgedSrcSize,
+                        ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
+    {
+        ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams;
+        unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Resize working context table params for input only, since the dict
+         * has its own tables. */
+        /* pledgedSrcSize == 0 means 0!
*/
+
+        if (cdict->matchState.dedicatedDictSearch) {
+            ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams);
+        }
+
+        params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
+                                                     cdict->dictContentSize, ZSTD_cpm_attachDict);
+        params.cParams.windowLog = windowLog;
+        params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
+                                                 ZSTDcrp_makeClean, zbuff), "");
+        assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy);
+    }
+
+    {   const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
+                                  - cdict->matchState.window.base);
+        const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
+        if (cdictLen == 0) {
+            /* don't even attach dictionaries with no contents */
+            DEBUGLOG(4, "skipping attaching empty dictionary");
+        } else {
+            DEBUGLOG(4, "attaching dictionary into context");
+            cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+            /* prep working match state so dict matches never have negative indices
+             * when they are translated to the working context's index space. */
+            if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
+                cctx->blockState.matchState.window.nextSrc =
+                    cctx->blockState.matchState.window.base + cdictEnd;
+                ZSTD_window_clear(&cctx->blockState.matchState.window);
+            }
+            /* loadedDictEnd is expressed within the referential of the active context */
+            cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+    }   }
+
+    cctx->dictID = cdict->dictID;
+    cctx->dictContentSize = cdict->dictContentSize;
+
+    /* copy block state */
+    ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+                                        ZSTD_compressionParameters const* cParams) {
+    if (ZSTD_CDictIndicesAreTagged(cParams)){
+        /* Remove tags from the CDict table if they are present.
+         * See docs on "short cache" in zstd_compress_internal.h for context. */
+        size_t i;
+        for (i = 0; i < tableSize; i++) {
+            U32 const taggedIndex = src[i];
+            U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
+            dst[i] = index;
+        }
+    } else {
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+    }
+}
+
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+    assert(!cdict->matchState.dedicatedDictSearch);
+    DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
+
+    {   unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Copy only compression parameters related to tables.
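+         * (hashLog/chainLog/strategy must match the cdict exactly so its
+         * tables can be reused verbatim; windowLog keeps the caller's value,
+         * as the asserts below double-check.)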
*/
+        params.cParams = *cdict_cParams;
+        params.cParams.windowLog = windowLog;
+        params.useRowMatchFinder = cdict->useRowMatchFinder;
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
+                                                 ZSTDcrp_leaveDirty, zbuff), "");
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
+    }
+
+    ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
+    assert(params.useRowMatchFinder != ZSTD_ps_auto);
+
+    /* copy tables */
+    {   size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
+                                    ? ((size_t)1 << cdict_cParams->chainLog)
+                                    : 0;
+        size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
+
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
+        /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
+        if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
+        }
+        /* copy tag table */
+        if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ZSTD_memcpy(cctx->blockState.matchState.tagTable,
+                cdict->matchState.tagTable,
+                tagTableSize);
+        }
+    }
+
+    /* Zero the hashTable3, since the cdict never fills it */
+    {   int const h3log = cctx->blockState.matchState.hashLog3;
+        size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+        assert(cdict->matchState.hashLog3 == 0);
+        ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+    }
+
+    ZSTD_cwksp_mark_tables_clean(&cctx->workspace);
+
+    /* copy dictionary offsets */
+    {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+        dstMatchState->window = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+
+    cctx->dictID = cdict->dictID;
+    cctx->dictContentSize = cdict->dictContentSize;
+
+    /* copy block state */
+    ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+/* We have a choice between copying the dictionary context into the working
+ * context, or referencing the dictionary context from the working context
+ * in-place. We decide here which strategy to use. */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            const ZSTD_CCtx_params* params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+
+    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)",
+                (unsigned)pledgedSrcSize);
+
+    if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+        return ZSTD_resetCCtx_byAttachingCDict(
+            cctx, cdict, *params, pledgedSrcSize, zbuff);
+    } else {
+        return ZSTD_resetCCtx_byCopyingCDict(
+            cctx, cdict, *params, pledgedSrcSize, zbuff);
+    }
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
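+ *  (Typical use, sketched: prime srcCCtx once with an expensive setup, e.g. a
+ *  loaded dictionary, then clone it cheaply for each new frame.)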
+ * The "context", in this case, refers to the hash and chain tables, + * entropy tables, and dictionary references. + * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. + * @return : 0, or an error code */ +static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + const ZSTD_CCtx* srcCCtx, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, + "Can't copy a ctx that's not in init stage."); + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); + ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { ZSTD_CCtx_params params = dstCCtx->requestedParams; + /* Copy only compression parameters related to tables. */ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); + assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); + assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); + assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); + assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); + assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); + } + + ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); + + /* copy tables */ + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, + srcCCtx->appliedParams.useRowMatchFinder, + 0 /* forDDSDict */) + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; + int const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, + srcCCtx->blockState.matchState.hashTable, + hSize * sizeof(U32)); + ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable, + srcCCtx->blockState.matchState.chainTable, + chainSize * sizeof(U32)); + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3, + srcCCtx->blockState.matchState.hashTable3, + h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace); + + /* copy dictionary offsets */ + { + const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + dstCCtx->dictID = srcCCtx->dictID; + dstCCtx->dictContentSize = srcCCtx->dictContentSize; + + /* copy block state */ + ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + + return 0; +} + +/*! ZSTD_copyCCtx() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. 
after creation, but before first call to ZSTD_compressContinue()). + * pledgedSrcSize==0 means "unknown". +* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +{ + ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + ZSTD_buffered_policy_e const zbuff = srcCCtx->bufferedPolicy; + ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); + + return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, + fParams, pledgedSrcSize, + zbuff); +} + + +#define ZSTD_ROWSIZE 16 +/*! ZSTD_reduceTable() : + * reduce table indexes by `reducerValue`, or squash to zero. + * PreserveMark preserves "unsorted mark" for btlazy2 strategy. + * It must be set to a clear 0/1 value, to remove branch during inlining. + * Presume table size is a multiple of ZSTD_ROWSIZE + * to help auto-vectorization */ +FORCE_INLINE_TEMPLATE void +ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) +{ + int const nbRows = (int)size / ZSTD_ROWSIZE; + int cellNb = 0; + int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ + assert(size < (1U<<31)); /* can be casted to int */ + +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table re-use logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. + * + * This function however is intended to operate on those dirty tables and + * re-clean them. So when this function is used correctly, we can unpoison + * the memory it operated on. This introduces a blind spot though, since + * if we now try to operate on __actually__ poisoned memory, we will not + * detect that. 
*/
+    __msan_unpoison(table, size * sizeof(U32));
+#endif
+
+    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+        int column;
+        for (column=0; column<ZSTD_ROWSIZE; column++) {
+            U32 newVal;
+            if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) {
+                /* This write is pointless, but is required(?) for the compiler
+                 * to auto-vectorize the loop. */
+                newVal = ZSTD_DUBT_UNSORTED_MARK;
+            } else if (table[cellNb] < reducerThreshold) {
+                newVal = 0;
+            } else {
+                newVal = table[cellNb] - reducerValue;
+            }
+            table[cellNb] = newVal;
+            cellNb++;
+    }   }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 0);
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 1);
+}
+
+/*! ZSTD_reduceIndex() :
+*   rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue)
+{
+    {   U32 const hSize = (U32)1 << params->cParams.hashLog;
+        ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
+    }
+
+    if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) {
+        U32 const chainSize = (U32)1 << params->cParams.chainLog;
+        if (params->cParams.strategy == ZSTD_btlazy2)
+            ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+        else
+            ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+    }
+
+    if (ms->hashLog3) {
+        U32 const h3Size = (U32)1 << ms->hashLog3;
+        ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue);
+    }
+}
+
+
+/*-*******************************************************
+*  Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    BYTE* const llCodeTable = seqStorePtr->llCode;
+    BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    U32 u;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
+    for (u=0; u<nbSeq; u++) {
+        U32 const llv = sequences[u].litLength;
+        U32 const mlv = sequences[u].mlBase;
+        llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
+        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
+        mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+    }
+    if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
+        llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+    if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
+        mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
+
+/* ZSTD_useTargetCBlockSize():
+ * Returns if target compressed block size param is being used.
+ * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
+    return (cctxParams->targetCBlockSize != 0);
+}
+
+/* ZSTD_blockSplitterEnabled():
+ * Returns if block splitting param is being used
+ * If used, compression will do best effort to split a block in order to improve compression ratio.
+ * At the time this function is called, the parameter must be finalized.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
+    assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
+    return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
+}
+
+/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
+ * and size of the sequences statistics
+ */
+typedef struct {
+    U32 LLtype;
+    U32 Offtype;
+    U32 MLtype;
+    size_t size;
+    size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+} ZSTD_symbolEncodingTypeStats_t;
+
+/* ZSTD_buildSequencesStatistics():
+ * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field.
+ * Modifies `nextEntropy` to have the appropriate values as a side effect.
+ * nbSeq must be greater than 0.
+ * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) { + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize = 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size = countSize; + return stats; + } + if (stats.LLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size = countSize; + return stats; + } + if (stats.Offtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size = countSize; + return stats; + } + if (stats.MLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + stats.size = (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. 
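+ * Output layout, for orientation (standard zstd block format): the compressed
+ * literals section, then nbSeq (1-3 bytes), then a 1-byte symbol-encoding
+ * header (LL/OF/ML modes), optional FSE table descriptions, and finally the
+ * interleaved sequences bitstream.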
+ */
+#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
+MEM_STATIC size_t
+ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+                          const ZSTD_entropyCTables_t* prevEntropy,
+                                ZSTD_entropyCTables_t* nextEntropy,
+                          const ZSTD_CCtx_params* cctxParams,
+                          void* dst, size_t dstCapacity,
+                          void* entropyWorkspace, size_t entropyWkspSize,
+                          const int bmi2)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+    unsigned* count = (unsigned*)entropyWorkspace;
+    FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t lastCountSize;
+
+    entropyWorkspace = count + (MaxSeq + 1);
+    entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
+
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+    assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
+
+    /* Compress literals */
+    {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+        size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+        /* Base suspicion of uncompressibility on ratio of literals to sequences */
+        unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
+        size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+        size_t const cSize = ZSTD_compressLiterals(
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy,
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
+                                    op, dstCapacity,
+                                    literals, litSize,
+                                    entropyWorkspace, entropyWkspSize,
+                                    bmi2, suspectUncompressible);
+        FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
+        assert(cSize <= dstCapacity);
+        op += cSize;
+    }
+
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall, "Can't fit seq hdr in output buf!");
+    if (nbSeq < 128) {
+        *op++ = (BYTE)nbSeq;
+    } else if (nbSeq < LONGNBSEQ) {
+        op[0] = (BYTE)((nbSeq>>8) + 0x80);
+        op[1] = (BYTE)nbSeq;
+        op+=2;
+    } else {
+        op[0]=0xFF;
+        MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ));
+        op+=3;
+    }
+    assert(op <= oend);
+    if (nbSeq==0) {
+        /* Copy the old tables over as if we repeated them */
+        ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+        return (size_t)(op - ostart);
+    }
+    {   BYTE* const seqHead = op++;
+        /* build stats for sequences */
+        const ZSTD_symbolEncodingTypeStats_t stats =
+                ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+                                             &prevEntropy->fse, &nextEntropy->fse,
+                                              op, oend,
+                                              strategy, count,
+                                              entropyWorkspace, entropyWkspSize);
+        FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
+        *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
+        lastCountSize = stats.lastCountSize;
+        op += stats.size;
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, (size_t)(oend - op),
CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + assert(op <= oend); + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. + */ + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(lastCountSize + bitstreamSize == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } + } + + DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart)); + return (size_t)(op - ostart); +} + +MEM_STATIC size_t +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
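+     * (A return value of 0 is the "do not compress" signal throughout this
+     * file; the caller then emits a raw block instead.)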
+ */ + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { + DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ + } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); + return cSize; +} + +/* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) +{ + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, + ZSTD_compressBlock_doubleFast, + ZSTD_compressBlock_greedy, + ZSTD_compressBlock_lazy, + ZSTD_compressBlock_lazy2, + ZSTD_compressBlock_btlazy2, + ZSTD_compressBlock_btopt, + ZSTD_compressBlock_btultra, + ZSTD_compressBlock_btultra2 }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, + ZSTD_compressBlock_doubleFast_extDict, + ZSTD_compressBlock_greedy_extDict, + ZSTD_compressBlock_lazy_extDict, + ZSTD_compressBlock_lazy2_extDict, + ZSTD_compressBlock_btlazy2_extDict, + ZSTD_compressBlock_btopt_extDict, + ZSTD_compressBlock_btultra_extDict, + ZSTD_compressBlock_btultra_extDict }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, + ZSTD_compressBlock_doubleFast_dictMatchState, + ZSTD_compressBlock_greedy_dictMatchState, + ZSTD_compressBlock_lazy_dictMatchState, + ZSTD_compressBlock_lazy2_dictMatchState, + ZSTD_compressBlock_btlazy2_dictMatchState, + ZSTD_compressBlock_btopt_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState }, + { NULL /* default for 0 */, + NULL, + NULL, + ZSTD_compressBlock_greedy_dedicatedDictSearch, + ZSTD_compressBlock_lazy_dedicatedDictSearch, + ZSTD_compressBlock_lazy2_dedicatedDictSearch, + NULL, + NULL, + NULL, + NULL } + }; + ZSTD_blockCompressor selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { + { ZSTD_compressBlock_greedy_row, + ZSTD_compressBlock_lazy_row, + ZSTD_compressBlock_lazy2_row }, + { ZSTD_compressBlock_greedy_extDict_row, + ZSTD_compressBlock_lazy_extDict_row, + ZSTD_compressBlock_lazy2_extDict_row }, + { ZSTD_compressBlock_greedy_dictMatchState_row, + ZSTD_compressBlock_lazy_dictMatchState_row, + ZSTD_compressBlock_lazy2_dictMatchState_row }, + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor = 
blockCompressor[(int)dictMode][(int)strat]; + } + assert(selectedCompressor != NULL); + return selectedCompressor; +} + +static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) +{ + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; +} + +void ZSTD_resetSeqStore(seqStore_t* ssPtr) +{ + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; +} + +typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + +static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +{ + ZSTD_matchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + } + return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ + } + ZSTD_resetSeqStore(&(zc->seqStore)); + /* required for optimal parser to read stats from dictionary */ + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; + /* tell the optimal parser how we expect to compress literals */ + ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode; + /* a gap between an attached dict and the current window is not safe, + * they must remain adjacent, + * and when that stops being the case, the dict must be unset */ + assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); + + /* limited update after a very long match */ + { const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 curr = (U32)(istart-base); + if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ + if (curr > ms->nextToUpdate + 384) + ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384)); + } + + /* select and store sequences */ + { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); + size_t lastLLSize; + { int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ + FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, + &zc->appliedParams.ldmParams, + src, srcSize), ""); + /* Updates ldmSeqStore.pos */ + lastLLSize = + 
ZSTD_ldm_blockCompress(&ldmSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); + } else { /* not long range mode */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } + return ZSTDbss_compress; +} + +static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +{ + const seqStore_t* seqStore = ZSTD_getSeqStore(zc); + const seqDef* seqStoreSeqs = seqStore->sequencesStart; + size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; + size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); + size_t literalsRead = 0; + size_t lastLLSize; + + ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; + size_t i; + repcodes_t updatedRepcodes; + + assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); + /* Ensure we have enough space for last literals "sequence" */ + assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); + ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + for (i = 0; i < seqStoreSeqSize; ++i) { + U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; + outSeqs[i].litLength = seqStoreSeqs[i].litLength; + outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { + outSeqs[i].matchLength += 0x10000; + } + } + + if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { + /* Derive the correct offset corresponding to a repcode */ + outSeqs[i].rep = seqStoreSeqs[i].offBase; + if (outSeqs[i].litLength != 0) { + rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; + } else { + if (outSeqs[i].rep == 3) { + rawOffset = updatedRepcodes.rep[0] - 1; + } else { + rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; + } + } + } + outSeqs[i].offset = rawOffset; + /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode + so we provide seqStoreSeqs[i].offset - 1 */ + ZSTD_updateRep(updatedRepcodes.rep, + seqStoreSeqs[i].offBase, + seqStoreSeqs[i].litLength == 0); + literalsRead += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. 
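These (of: 0, ml: 0) markers are exactly what ZSTD_mergeBlockDelimiters(), defined just below, folds back into the neighboring litLength fields. A minimal editorial sketch of driving the two entry points together; collect_sequences is a hypothetical helper, and per-call error checks are elided:

#define ZSTD_STATIC_LINKING_ONLY   /* the ZSTD_Sequence API is experimental */
#include "zstd.h"

static size_t collect_sequences(const void* src, size_t srcSize,
                                ZSTD_Sequence* seqs, size_t maxSeqs)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t nbSeqs = 0;
    if (cctx != NULL) {
        /* one ZSTD_Sequence per match, plus block-boundary markers */
        nbSeqs = ZSTD_generateSequences(cctx, seqs, maxSeqs, src, srcSize);
        /* fold the (of: 0, ml: 0) delimiters into preceding litLengths */
        nbSeqs = ZSTD_mergeBlockDelimiters(seqs, nbSeqs);
        ZSTD_freeCCtx(cctx);
    }
    return nbSeqs;
}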
+ */ + assert(seqStoreLiteralsSize >= literalsRead); + lastLLSize = seqStoreLiteralsSize - literalsRead; + outSeqs[i].litLength = (U32)lastLLSize; + outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; + seqStoreSeqSize++; + zc->seqCollector.seqIndex += seqStoreSeqSize; +} + +size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) +{ + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; + seqCollector.seqStart = outSeqs; + seqCollector.seqIndex = 0; + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + + ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_customFree(dst, ZSTD_defaultCMem); + return zc->seqCollector.seqIndex; +} + +size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) { + size_t in = 0; + size_t out = 0; + for (; in < seqsSize; ++in) { + if (sequences[in].offset == 0 && sequences[in].matchLength == 0) { + if (in != seqsSize - 1) { + sequences[in+1].litLength += sequences[in].litLength; + } + } else { + sequences[out] = sequences[in]; + ++out; + } + } + return out; +} + +/* Unrolled loop to read four size_ts of input at a time. Returns 1 if is RLE, 0 if not. */ +static int ZSTD_isRLE(const BYTE* src, size_t length) { + const BYTE* ip = src; + const BYTE value = ip[0]; + const size_t valueST = (size_t)((U64)value * 0x0101010101010101ULL); + const size_t unrollSize = sizeof(size_t) * 4; + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; + size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; + } + } + } + return 1; +} + +/* Returns true if the given block may be RLE. + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +static int ZSTD_maybeRLE(seqStore_t const* seqStore) +{ + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); + + return nbSeqs < 4 && nbLits < 10; +} + +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/** ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
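One detail of ZSTD_isRLE() above is worth calling out: multiplying the first byte by 0x0101010101010101 broadcasts it into every byte lane of a machine word, so the loop compares sizeof(size_t) bytes per load instead of one. A self-contained editorial sketch of the same trick, with hypothetical names:

#include <stdint.h>
#include <string.h>

/* Returns 1 if the first n bytes (n a multiple of 8) all equal p[0]. */
static int all_bytes_equal64(const uint8_t* p, size_t n)
{
    uint64_t const pattern = (uint64_t)p[0] * 0x0101010101010101ULL;
    size_t i;
    for (i = 0; i < n; i += 8) {
        uint64_t w;
        memcpy(&w, p + i, 8);   /* unaligned-safe load */
        if (w != pattern) return 0;
    }
    return 1;
}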
+ * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int literalsCompressionIsDisabled, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } + } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + 
hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; + return stats; +} + +/** ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. + * @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + unsigned* countWorkspace = (unsigned*)workspace; + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + prevEntropy, nextEntropy, op, oend, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; + fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; + fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; +} + + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE + * + * @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); + U32 singleStream = litSize < 256; + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. 
*/ + assert(max <= defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. 
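The symbol-type estimator above prices each code stream in bits -- against the strategy's default distribution for set_basic (cf. ZSTD_crossEntropyCost) or against the active FSE table for set_compressed/set_repeat (cf. ZSTD_fseBitCost) -- then shifts right by 3 to convert to bytes. A floating-point editorial sketch of the cross-entropy idea, with hypothetical names (the library itself uses fixed-point arithmetic):

#include <math.h>

/* Approximate bit cost of coding histogram `count` (symbols 0..max) against
 * a distribution normalized to 2^normLog, as in FSE normalized counters. */
static double cross_entropy_bits(const unsigned* count, const short* norm,
                                 unsigned max, unsigned normLog)
{
    double bits = 0.0;
    unsigned s;
    for (s = 0; s <= max; s++) {
        /* norm[s] == -1 flags a low-probability symbol; cost it as weight 1 */
        double const w = (double)(norm[s] < 0 ? 1 : norm[s]);
        if (count[s] && w > 0.0)
            bits += (double)count[s] * ((double)normLog - log2(w));
    }
    return bits;   /* the real code shifts >> 3: bits to bytes */
}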
+ */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { + ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), + &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { + size_t literalsBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; + } + } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + size_t matchBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; + } + } + return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. 
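Both counters above add 0x10000 at longLengthPos because seqDef stores litLength and mlBase in 16-bit fields; at most one sequence per block may overflow them, and its index and kind are parked in longLengthPos/longLengthType. An editorial sketch of the reconstruction, with hypothetical types:

#include <stddef.h>
#include <stdint.h>

typedef struct { uint16_t litLength; uint16_t mlBase; } mini_seq;

/* Sum literal bytes, undoing the single permitted 16-bit wrap-around,
 * as the counting loops above do. */
static size_t total_literals(const mini_seq* seqs, size_t nbSeqs,
                             size_t longLengthPos, int longLengthIsLiteral)
{
    size_t total = 0;
    size_t i;
    for (i = 0; i < nbSeqs; i++) {
        total += seqs[i].litLength;
        if (i == longLengthPos && longLengthIsLiteral)
            total += 0x10000;   /* the stored field wrapped past 65535 */
    }
    return total;
}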
+ */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) +{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType != ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { + resultSeqStore->longLengthType = ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -= (U32)startIdx; + } + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ + assert(resultSeqStore->lit == originalSeqStore->lit); + } else { + size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; + resultSeqStore->ofCode += startIdx; +} + +/** + * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. + * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) +{ + U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ + assert(OFFBASE_IS_REPCODE(offBase)); + if (adjustedRepCode == ZSTD_REP_NUM) { + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 + * This is only valid if it results in a valid offset value, aka > 0. + * Note : it may happen that `rep[0]==1` in exceptional circumstances. + * In which case this function will return 0, which is an invalid offset. + * It's not an issue though, since this value will be + * compared and discarded within ZSTD_seqStore_resolveOffCodes(). + */ + return rep[0] - 1; + } + return rep[adjustedRepCode]; +} + +/** + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following numbering scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, + seqStore_t* const seqStore, U32 const nbSeq) { + U32 idx = 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0); + U32 const offBase = seq->offBase; + assert(seq->offBase > 0); + if (OFFBASE_IS_REPCODE(offBase)) { + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. 
Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { + seq->offBase = cRawOffset + ZSTD_REP_NUM; + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); + ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. + */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) +{ + const U32 rleMaxLength = 25; + BYTE* op = (BYTE*)dst; + const BYTE* ip = (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ + repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + cSeqsSize = 1; + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. 
*/ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. + * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +{ + seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; + seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; + seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + return; + } + DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); + DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); + } +} + +/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partition table - 1). 
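Stripped of the entropy bookkeeping, the recursion described above follows a simple estimate-and-conquer pattern. An editorial sketch, with cost() standing in for ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize() and error handling omitted:

/* Emit split points (in ascending order) wherever coding two halves
 * separately is estimated to beat coding the whole range. */
static void derive_splits_sketch(size_t begin, size_t end,
                                 size_t (*cost)(size_t begin, size_t end),
                                 size_t* splits, size_t* nbSplits,
                                 size_t maxSplits)
{
    size_t const mid = (begin + end) / 2;
    if (end - begin < 300 /* cf. MIN_SEQUENCES_BLOCK_SPLITTING */
        || *nbSplits >= maxSplits)
        return;
    if (cost(begin, mid) + cost(mid, end) < cost(begin, end)) {
        derive_splits_sketch(begin, mid, cost, splits, nbSplits, maxSplits);
        if (*nbSplits < maxSplits)
            splits[(*nbSplits)++] = mid;
        derive_splits_sketch(mid, end, cost, splits, nbSplits, maxSplits);
    }
}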
+ */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +{ + seqStoreSplits splits = {partitions, 0}; + if (nbSeq <= 4) { + DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split"); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] = nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compression ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, + const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) +{ + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; + U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ + seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; + seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two + * separate repcode histories that simulate repcode history on compression and decompression side, + * and use the histories to determine whether we must replace a particular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use + * the replacement offset value rather than the original repcode to update the repcode history. + * dRep also will be the final repcode history sent to the next block. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
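Both histories evolve by the same rule, a 3-slot most-recently-used list (cf. ZSTD_updateRep() and the offBase numbering documented above: 1-3 are repcodes, 4+ encodes real_offset+3). An editorial sketch of that update rule, simplified from the real implementation:

/* Simplified repcode-history update; rep[] behaves as a 3-entry MRU list. */
static void update_rep_sketch(unsigned rep[3], unsigned offBase, unsigned ll0)
{
    if (offBase > 3) {                  /* real offset: offBase == offset + 3 */
        rep[2] = rep[1]; rep[1] = rep[0];
        rep[0] = offBase - 3;
    } else {                            /* repcode 1-3 */
        unsigned const idx = offBase - 1 + ll0;
        if (idx != 0) {                 /* idx == 0: rep[0] again, no change */
            unsigned const off = (idx == 3) ? rep[0] - 1 : rep[idx];
            if (idx >= 2) rep[2] = rep[1];
            rep[1] = rep[0];
            rep[0] = off;
        }
    }
}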
+ */ + repcodes_t dRep; + repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { + size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + + size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ + srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); + } + + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; + op += cSizeChunk; + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; + assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes + * for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) +{ + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); + } + + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; +} + +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) +{ + /* This is an estimated upper bound for the length of an rle block. + * This isn't the actual upper bound. + * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + /* encode sequences and literals */ + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + cSize < rleMaxLength && + ZSTD_isRLE(ip, srcSize)) + { + cSize = 1; + op[0] = ip[0]; + } + +out: + if (!ZSTD_isError(cSize) && cSize > 1) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. 
+ */ + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const size_t bss, U32 lastBlock) +{ + DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); + if (bss == ZSTDbss_compress) { + if (/* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) + { + return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); + } + /* Attempt superblock compression. + * + * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the + * standard ZSTD_compressBound(). This is a problem, because even if we have + * space now, taking an extra byte now could cause us to run out of space later + * and violate ZSTD_compressBound(). + * + * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. + * + * In order to respect ZSTD_compressBound() we must attempt to emit a raw + * uncompressed block in these cases: + * * cSize == 0: Return code for an uncompressed block. + * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). + * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of + * output space. + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ + { + size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { + size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return cSize; + } + } + } + } + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. + * The decoder will be able to stream this block since it is uncompressed. 
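Whichever path wins -- raw, RLE, or compressed -- each block is fronted by the same 3-byte little-endian header assembled by writeBlockHeader() earlier in this file: one last-block bit, a 2-bit block type, and a 21-bit size. An editorial decoder for that layout:

#include <stdint.h>

typedef struct { unsigned lastBlock; unsigned blockType; uint32_t size; } block_header;

/* Inverse of writeBlockHeader(): header = lastBlock | (type << 1) | (size << 3),
 * stored as 3 little-endian bytes. blockType: 0=raw, 1=RLE, 2=compressed. */
static block_header read_block_header(const uint8_t p[3])
{
    uint32_t const h = (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
    block_header bh;
    bh.lastBlock = h & 1;
    bh.blockType = (h >> 1) & 3;
    bh.size      = h >> 3;   /* content size; run length for RLE blocks */
    return bh;
}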
+ */ + return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); +} + +static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock) +{ + size_t cSize = 0; + const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + + cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, + void const* iend) +{ + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const maxDist = (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_cwksp_mark_tables_dirty(ws); + ZSTD_reduceIndex(ms, params, correction); + ZSTD_cwksp_mark_tables_clean(ws); + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + /* invalidate dictionaries on overflow correction */ + ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; + } +} + +/*! ZSTD_compress_frameChunk() : +* Compress a chunk of data into one or multiple blocks. +* All blocks will be terminated, all input will be consumed. +* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. +* Frame is supposed already started (header already produced) +* @return : compressed size, or an error code +*/ +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) +{ + size_t blockSize = cctx->blockSize; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + + DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + XXH64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. 
We need to revisit and change this logic to be more consistent */ + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); + ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + + /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ + if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; + + { size_t cSize; + if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); + assert(cSize > 0); + assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); + } else { + cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize, 1 /* frame */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); + + if (cSize == 0) { /* block is not compressible */ + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + } else { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } + } + + + ip += blockSize; + assert(remaining >= blockSize); + remaining -= blockSize; + op += cSize; + assert(dstCapacity >= cSize); + dstCapacity -= cSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", + (unsigned)cSize); + } } + + if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; + return (size_t)(op-ostart); +} + + +static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +{ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; + U32 const windowSize = (U32)1 << params->cParams.windowLog; + U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); + BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); + U32 const fcsCode = params->fParams.contentSizeFlag ? 
+ (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ + BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); + size_t pos=0; + + assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); + RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, + "dst buf is too small to fit worst-case frame header size."); + DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", + !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); + if (params->format == ZSTD_f_zstd1) { + MEM_writeLE32(dst, ZSTD_MAGICNUMBER); + pos = 4; + } + op[pos++] = frameHeaderDescriptionByte; + if (!singleSegment) op[pos++] = windowLogByte; + switch(dictIDSizeCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : break; + case 1 : op[pos] = (BYTE)(dictID); pos++; break; + case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; + case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break; + } + switch(fcsCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; + case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; + case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; + case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break; + } + return pos; +} + +/* ZSTD_writeSkippableFrame_advanced() : + * Writes out a skippable frame with the specified magic number variant (16 are supported), + * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. + * + * Returns the total number of bytes written, or a ZSTD error code. 
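As a usage note (editorial; write_metadata is a hypothetical wrapper): a skippable frame is simply an 8-byte header -- the chosen magic number and the payload size -- followed by the payload, which decoders skip silently:

#define ZSTD_STATIC_LINKING_ONLY
#include <string.h>
#include "zstd.h"

static size_t write_metadata(void* dst, size_t dstCapacity, const char* note)
{
    /* variant 0 selects ZSTD_MAGIC_SKIPPABLE_START; variants 0..15 exist */
    return ZSTD_writeSkippableFrame(dst, dstCapacity, note, strlen(note), 0);
}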
+ */
+size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, unsigned magicVariant) {
+ BYTE* op = (BYTE*)dst;
+ RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */,
+ dstSize_tooSmall, "Not enough room for skippable frame");
+ RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame");
+ RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported");
+
+ MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant));
+ MEM_writeLE32(op+4, (U32)srcSize);
+ ZSTD_memcpy(op+8, src, srcSize);
+ return srcSize + ZSTD_SKIPPABLEHEADERSIZE;
+}
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ * or an error code if `dstCapacity` is too small (< ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall,
+ "dst buf is too small to write frame trailer empty block.");
+ { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */
+ MEM_writeLE24(dst, cBlockHeader24);
+ return ZSTD_blockHeaderSize;
+ }
+}
+
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{
+ RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
+ "wrong cctx stage");
+ RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
+ parameter_unsupported,
+ "incompatible with ldm");
+ cctx->externSeqStore.seq = seq;
+ cctx->externSeqStore.size = nbSeq;
+ cctx->externSeqStore.capacity = nbSeq;
+ cctx->externSeqStore.pos = 0;
+ cctx->externSeqStore.posInSequence = 0;
+ return 0;
+}
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 frame, U32 lastFrameChunk)
+{
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ size_t fhSize = 0;
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+ cctx->stage, (unsigned)srcSize);
+ RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong,
+ "missing init (ZSTD_compressBegin)");
+
+ if (frame && (cctx->stage==ZSTDcs_init)) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams,
+ cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+ FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+ assert(fhSize <= dstCapacity);
+ dstCapacity -= fhSize;
+ dst = (char*)dst + fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (!srcSize) return fhSize; /* do not generate an empty block if no input */
+
+ if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) {
+ ms->forceNonContiguous = 0;
+ ms->nextToUpdate = ms->window.dictLimit;
+ }
+ if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
+ ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
+ }
+
+ if (!frame) {
+ /* overflow check and correction for block mode */
+ ZSTD_overflowCorrectIfNeeded(
+ ms, &cctx->workspace, &cctx->appliedParams,
+ src, (BYTE const*)src + srcSize);
+ }
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
+ { size_t const cSize = frame ?
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+ ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */);
+ FORWARD_IF_ERROR(cSize, "%s", frame ?
"ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); + cctx->consumedSrcSize += srcSize; + cctx->producedCSize += (cSize + fhSize); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + RETURN_ERROR_IF( + cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize >= %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + fhSize; + } +} + +size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); +} + + +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +{ + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); + return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); +} + +size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); + { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); +} + +/*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + + /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + { /* Ensure large dictionaries can't cause index overflow */ + + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; + + int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); + if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { + /* Some dictionary matchfinders in zstd use "short cache", + * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each + * CDict hashtable entry as a tag rather than as part of an index. + * When short cache is used, we need to truncate the dictionary + * so that its indices don't overlap with the tag. */ + U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; + maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); + assert(!loadLdmDict); + } + + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } } + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. 
*/ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; + + if (loadLdmDict) { + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + } + + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + + if (loadLdmDict) + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); + break; + + default: + assert(0); /* not possible : not a valid strategy id */ + } + + ms->nextToUpdate = (U32)(iend - ms->window.base); + return 0; +} + + +/* Dictionaries that assign zero probability to symbols that show up causes problems + * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check + * and only dictionaries with 100% valid symbols can be assumed valid. + */ +static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) +{ + U32 s; + if (dictMaxSymbolValue < maxSymbolValue) { + return FSE_repeat_check; + } + for (s = 0; s <= maxSymbolValue; ++s) { + if (normalizedCounter[s] == 0) { + return FSE_repeat_check; + } + } + return FSE_repeat_valid; +} + +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + const void* const dict, size_t dictSize) +{ + short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff; + const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ + const BYTE* const dictEnd = dictPtr + dictSize; + dictPtr += 8; + bs->entropy.huf.repeatMode = HUF_repeat_check; + + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, + dictEnd-dictPtr, &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
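+ * (HUF_repeat_valid means the first block may reuse the table as-is,
+ * with no per-symbol validation pass.)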
Otherwise, we set it to check */ + if (!hasZeroWeights) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.offcodeCTable, + offcodeNCount, MaxOff, offcodeLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.matchlengthCTable, + matchlengthNCount, matchlengthMaxValue, matchlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.litlengthCTable, + litlengthNCount, litlengthMaxValue, litlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + bs->rep[0] = MEM_readLE32(dictPtr+0); + bs->rep[1] = MEM_readLE32(dictPtr+4); + bs->rep[2] = MEM_readLE32(dictPtr+8); + dictPtr += 12; + + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */ + bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)); + + /* All repCodes must be <= dictContentSize and != 0 */ + { U32 u; + for (u=0; u<3; u++) { + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + + return dictPtr - (const BYTE*)dict; +} + +/* Dictionary format : + * See : + * 
https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : dictID, or an error code
+ * assumptions : magic number supposed already checked
+ * dictSize supposed >= 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ ZSTD_tableFillPurpose_e tfp,
+ void* workspace)
+{
+ const BYTE* dictPtr = (const BYTE*)dict;
+ const BYTE* const dictEnd = dictPtr + dictSize;
+ size_t dictID;
+ size_t eSize;
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+ assert(dictSize >= 8);
+ assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
+
+ dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ );
+ eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
+ FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
+ dictPtr += eSize;
+
+ {
+ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+ FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
+ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
+ }
+ return dictID;
+}
+
+/** ZSTD_compress_insertDictionary() :
+* @return : dictID, or an error code */
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ldmState_t* ls,
+ ZSTD_cwksp* ws,
+ const ZSTD_CCtx_params* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ ZSTD_tableFillPurpose_e tfp,
+ void* workspace)
+{
+ DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+ if ((dict==NULL) || (dictSize<8)) {
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ return 0;
+ }
+
+ ZSTD_reset_compressedBlockState(bs);
+
+ /* dict restricted modes */
+ if (dictContentType == ZSTD_dct_rawContent)
+ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_auto) {
+ DEBUGLOG(4, "raw content dictionary detected");
+ return ZSTD_loadDictionaryContent(
+ ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+ }
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ assert(0); /* impossible */
+ }
+
+ /* dict as full zstd dictionary */
+ return ZSTD_loadZstdDictionary(
+ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
+}
+
+#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
+#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
+
+/*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
+ * @return : 0, or an error code */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params, U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize;
+#if ZSTD_TRACE
+ cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ?
ZSTD_trace_compress_begin(cctx) : 0; +#endif + DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if ( (cdict) + && (cdict->dictContentSize > 0) + && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0) + && (params->attachDictPref != ZSTD_dictForceLoad) ) { + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); + } + + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, + ZSTDcrp_makeClean, zbuff) , ""); + { size_t const dictID = cdict ? + ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, + ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, + dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; + cctx->dictContentSize = dictContentSize; + } + return 0; +} + +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); + /* compression parameters verification and optimization */ + FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); + return ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, dtlm, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); +} + +/*! ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams; + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compressBegin_advanced_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, + NULL /*cdict*/, + &cctxParams, pledgedSrcSize); +} + +size_t +ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_CCtx_params cctxParams; + { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); +} + + +/*! 
ZSTD_writeEpilogue() : +* Ends a frame. +* @return : nb of bytes written into dst (or an error code) */ +static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } + + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); + MEM_writeLE32(op, checksum); + op += 4; + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ + return op-ostart; +} + +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +{ +#if ZSTD_TRACE + if (cctx->traceCtx && ZSTD_trace_compress_end != NULL) { + int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0; + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + trace.dictionaryID = cctx->dictID; + trace.dictionarySize = cctx->dictContentSize; + trace.uncompressedSize = cctx->consumedSrcSize; + trace.compressedSize = cctx->producedCSize + extraCSize; + trace.params = &cctx->appliedParams; + trace.cctx = cctx; + ZSTD_trace_compress_end(cctx->traceCtx, &trace); + } + cctx->traceCtx = 0; +#else + (void)cctx; + (void)extraCSize; +#endif +} + +size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, + dst, dstCapacity, src, srcSize, + 1 /* frame mode */, 1 /* last chunk */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); + endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); + FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + DEBUGLOG(4, "end of frame : controlling src size"); + RETURN_ERROR_IF( + cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize = %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + ZSTD_CCtx_trace(cctx, endResult); + return cSize + endResult; +} + +size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced"); + 
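+ /* Illustrative caller sketch (hypothetical buffers and level): fetch
+ * parameters, then compress in one shot:
+ * ZSTD_parameters p = ZSTD_getParams(3, srcSize, dictSize);
+ * size_t const r = ZSTD_compress_advanced(cctx, dst, dstCapacity,
+ * src, srcSize, dict, dictSize, p);
+ * r is the compressed size, or an error code per ZSTD_isError(r). */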
FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctx->simpleApiParams); +} + +/* Internal */ +size_t ZSTD_compress_advanced_internal( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel) +{ + { + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); + assert(params.fParams.contentSizeFlag == 1); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + } + DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); +} + +size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize); + assert(cctx != NULL); + return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); +} + +size_t ZSTD_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + size_t result; +#if ZSTD_COMPRESS_HEAPMODE + ZSTD_CCtx* cctx = ZSTD_createCCtx(); + RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed"); + result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtx(cctx); +#else + ZSTD_CCtx ctxBody; + ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); + result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ +#endif + return result; +} + + +/* ===== Dictionary API ===== */ + +/*! ZSTD_estimateCDictSize_advanced() : + * Estimate amount of memory that will be needed to create a dictionary with following arguments */ +size_t ZSTD_estimateCDictSize_advanced( + size_t dictSize, ZSTD_compressionParameters cParams, + ZSTD_dictLoadMethod_e dictLoadMethod) +{ + DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); + return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); +} + +size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); +} + +size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support sizeof on NULL */ + DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict)); + /* cdict may be in the workspace */ + return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + + ZSTD_cwksp_sizeof(&cdict->workspace); +} + +static size_t ZSTD_initCDict_internal( + ZSTD_CDict* cdict, + const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_CCtx_params params) +{ + DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); + assert(!ZSTD_checkCParams(params.cParams)); + cdict->matchState.cParams = params.cParams; + cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { + cdict->dictContent = dictBuffer; + } else { + void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); + RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); + cdict->dictContent = internalBuffer; + ZSTD_memcpy(internalBuffer, dictBuffer, dictSize); + } + cdict->dictContentSize = dictSize; + cdict->dictContentType = dictContentType; + + cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); + + + /* Reset the state to no dictionary */ + ZSTD_reset_compressedBlockState(&cdict->cBlockState); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &cdict->matchState, + &cdict->workspace, + ¶ms.cParams, + params.useRowMatchFinder, + ZSTDcrp_makeClean, + ZSTDirp_reset, + ZSTD_resetTarget_CDict), ""); + /* (Maybe) load the dictionary + * Skips loading the dictionary if it is < 8 bytes. + */ + { params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + params.fParams.contentSizeFlag = 1; + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, + dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; + } + } + + return 0; +} + +static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_compressionParameters cParams, + ZSTD_paramSwitch_e useRowMatchFinder, + U32 enableDedicatedDictSearch, + ZSTD_customMem customMem) +{ + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); + ZSTD_cwksp ws; + ZSTD_CDict* cdict; + + if (!workspace) { + ZSTD_customFree(workspace, customMem); + return NULL; + } + + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc); + + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + assert(cdict != NULL); + ZSTD_cwksp_move(&cdict->workspace, &ws); + cdict->customMem = customMem; + cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ + cdict->useRowMatchFinder = useRowMatchFinder; + return cdict; + } +} + +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; + return ZSTD_createCDict_advanced2( + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + &cctxParams, customMem); +} + +ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* originalCctxParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams = *originalCctxParams; + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + + DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { + cParams = ZSTD_dedicatedDictSearch_getCParams( + cctxParams.compressionLevel, dictSize); + ZSTD_overrideCParams(&cParams, &cctxParams.cParams); + } else { + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) { + /* Fall back to non-DDSS params */ + cctxParams.enableDedicatedDictSearch = 0; + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + + cdict = ZSTD_createCDict_advanced_internal(dictSize, + dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { + ZSTD_freeCDict(cdict); + return NULL; + } + + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +size_t ZSTD_freeCDict(ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = cdict->customMem; + int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); + ZSTD_cwksp_free(&cdict->workspace, cMem); + if (!cdictInWorkspace) { + ZSTD_customFree(cdict, cMem); + } + return 0; + } +} + +/*! ZSTD_initStaticCDict_advanced() : + * Generate a digested dictionary in provided memory area. + * workspace: The memory area to emplace the dictionary into. + * Provided pointer must 8-bytes aligned. + * It must outlive dictionary usage. + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level + * into its relevants cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. + */ +const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + matchStateSize; + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { + ZSTD_cwksp ws; + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + if (cdict == NULL) return NULL; + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + + DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", + (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + params) )) + return NULL; + + return cdict; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) +{ + assert(cdict != NULL); + return cdict->matchState.cParams; +} + +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. 
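+ * (This is the 32-bit little-endian field at byte offset 4 of the
+ * dictionary header, as read by ZSTD_loadZstdDictionary() above.)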
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; + return cdict->dictID; +} + +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + ZSTD_CCtx_params cctxParams; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); + RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); + /* Initialize the cctxParams from the cdict */ + { + ZSTD_parameters params; + params.fParams = fParams; + params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0 ) ? + ZSTD_getCParamsFromCDict(cdict) + : ZSTD_getCParams(cdict->compressionLevel, + pledgedSrcSize, + cdict->dictContentSize); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, cdict->compressionLevel); + } + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog); + } + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &cctxParams, pledgedSrcSize, + ZSTDb_not_buffered); +} + + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); +} + +/* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); +} + +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. 
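+ * New code would typically use ZSTD_CCtx_refCDict() with ZSTD_compress2(),
+ * setting frame parameters through ZSTD_CCtx_setParameter().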
+ */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. + * Note that compression parameters are decided at CDict creation time + * while frame parameters are hardcoded */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + + + +/* ****************************************************************** +* Streaming +********************************************************************/ + +ZSTD_CStream* ZSTD_createCStream(void) +{ + DEBUGLOG(3, "ZSTD_createCStream"); + return ZSTD_createCStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticCCtx(workspace, workspaceSize); +} + +ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ /* CStream and CCtx are now same object */ + return ZSTD_createCCtx_advanced(customMem); +} + +size_t ZSTD_freeCStream(ZSTD_CStream* zcs) +{ + return ZSTD_freeCCtx(zcs); /* same object */ +} + + + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_CStreamOutSize(void) +{ + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; +} + +static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) +{ + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; + else + return ZSTD_cpm_noAttachDict; +} + +/* ZSTD_resetCStream(): + * pledgedSrcSize == 0 means "unknown" */ +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +/*! ZSTD_initCStream_internal() : + * Note : for lib/compress only. Used by zstdmt_compress.c. 
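+ * It resets the session, records the requested params, then installs
+ * either the raw dict or the cdict (never both, per Assumption 2).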
+ * Assumption 1 : params are valid + * Assumption 2 : either dict, or cdict, is defined, not both */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_internal"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + zcs->requestedParams = *params; + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if (dict) { + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + } else { + /* Dictionary is cleared if !cdict */ + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + } + return 0; +} + +/* ZSTD_initCStream_usingCDict_advanced() : + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + zcs->requestedParams.fParams = fParams; + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + +/* note : cdict must outlive compression session */ +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + + +/* ZSTD_initCStream_advanced() : + * pledgedSrcSize must be exact. + * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */ +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pss) +{ + /* for compatibility with older programs relying on this behavior. + * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. + * This line will be removed in the future. + */ + U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, ¶ms); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. 
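+ * (ZSTD_CONTENTSIZE_UNKNOWN is (unsigned long long)-1, as static-asserted
+ * earlier in this file, so it cannot collide with a real size.)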
+ * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_srcSize"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + return 0; +} + +/*====== Compression ======*/ + +static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) +{ + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + return cctx->blockSize - cctx->stableIn_notConsumed; + } + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); + { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) hintInSize = cctx->blockSize; + return hintInSize; + } +} + +/** ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants + * @return : hint size for next input to complete ongoing block */ +static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) +{ + const char* const istart = (assert(input != NULL), (const char*)input->src); + const char* const iend = (istart != NULL) ? istart + input->size : istart; + const char* ip = (istart != NULL) ? istart + input->pos : istart; + char* const ostart = (assert(output != NULL), (char*)output->dst); + char* const oend = (ostart != NULL) ? ostart + output->size : ostart; + char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); + assert(zcs != NULL); + if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { + assert(input->pos >= zcs->stableIn_notConsumed); + input->pos -= zcs->stableIn_notConsumed; + ip -= zcs->stableIn_notConsumed; + zcs->stableIn_notConsumed = 0; + } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + } + if (zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) { + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } + if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); + if (output->dst == NULL) assert(output->size == 0); + assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { + switch(zcs->streamStage) + { + case zcss_init: + RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!"); + + case zcss_load: + if ( (flushMode == ZSTD_e_end) + && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ + size_t const cSize = ZSTD_compressEnd(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; + op += cSize; + zcs->frameEnded = 1; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + someMoreWork = 0; break; + } + /* complete loading into inBuffer in buffered mode */ + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; + if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (zcs->inBuffPos == zcs->inToCompress) ) { + /* empty */ + someMoreWork = 0; break; + } + } else { + assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + if ( (flushMode == ZSTD_e_continue) + && ( (size_t)(iend - ip) < zcs->blockSize) ) { + /* can't compress a full block : stop here */ + zcs->stableIn_notConsumed = (size_t)(iend - ip); + ip = iend; /* pretend to have consumed input */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (ip == iend) ) { + /* empty */ + someMoreWork = 0; break; + } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; + size_t oSize = oend-op; + size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress + : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+ ZSTD_compressEnd(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ + if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed directly in outBuffer"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + } + break; + } + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->streamStage = zcss_flush; /* pass-through to flush stage */ + } + ZSTD_FALLTHROUGH; + case zcss_flush: + DEBUGLOG(5, "flush stage"); + assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered); + { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), + zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", + (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); + if (flushed) + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush!=flushed) { + /* flush not fully completed, presumably because dst is too small */ + assert(op==oend); + someMoreWork = 0; + break; + } + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed on flush"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + break; + } + zcs->streamStage = zcss_load; + break; + } + + default: /* impossible */ + assert(0); + } + } + + input->pos = ip - istart; + output->pos = op - ostart; + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); +} + +static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers >= 1) { + assert(cctx->mtctx != NULL); + return ZSTDMT_nextInputSizeHint(cctx->mtctx); + } +#endif + return ZSTD_nextInputSizeHint(cctx); + +} + +size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); + return ZSTD_nextInputSizeHint_MTorST(zcs); +} + +/* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. 
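+ * With ZSTD_c_stableInBuffer / ZSTD_c_stableOutBuffer the caller promises
+ * to present the same buffers across calls; the snapshot taken here is
+ * what ZSTD_checkBufferStability() compares against.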
+ */ +static void +ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) +{ + DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + cctx->expectedOutBufferSize = output->size - output->pos; + } +} + +/* Validate that the input/output buffers match the expectations set by + * ZSTD_setBufferExpectations. + */ +static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + ZSTD_outBuffer const* output, + ZSTD_inBuffer const* input, + ZSTD_EndDirective endOp) +{ + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; + if (expect.src != input->src || expect.pos != input->pos) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } + (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; +} + +static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, + size_t inSize) +{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the requested params. + * But do not take the cdict's compression level if the "cdict" is actually a localDict + * generated from ZSTD_initLocalDict(). + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ + + { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); + +#ifdef ZSTD_MULTITHREAD + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { +#if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? 
ZSTD_trace_compress_begin(cctx) : 0; +#endif + /* mt context creation */ + if (cctx->mtctx == NULL) { + DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem, cctx->pool); + RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!"); + } + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0; + cctx->dictContentSize = cctx->cdict ? cctx->cdict->dictContentSize : prefixDict.dictSize; + cctx->consumedSrcSize = 0; + cctx->producedCSize = 0; + cctx->streamStage = zcss_load; + cctx->appliedParams = params; + } else +#endif /* ZSTD_MULTITHREAD */ + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast, + cctx->cdict, + ¶ms, pledgedSrcSize, + ZSTDb_buffered) , ""); + assert(cctx->appliedParams.nbWorkers == 0); + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) { + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ + cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; + } + return 0; +} + +/* @return provides a minimum amount of data remaining to be flushed from internal buffers + */ +size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); + /* check conditions */ + RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer"); + RETURN_ERROR_IF(input->pos > input->size, srcSize_wrong, "invalid input buffer"); + RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective"); + assert(cctx != NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ + size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; + if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ + && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ + && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ + if (cctx->stableIn_notConsumed) { /* not the first time */ + /* check stable source guarantees */ + RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); + RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); + } + /* pretend input was consumed, to give a sense forward progress */ + input->pos = input->size; + /* save stable inBuffer, for later 
control, and flush/end */ + cctx->expectedInBuffer = *input; + /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ + cctx->stableIn_notConsumed += inputSize; + /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ + return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ + } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); + ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + + FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers"); + /* compression stage */ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + size_t flushMin; + if (cctx->cParamsChanged) { + ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); + cctx->cParamsChanged = 0; + } + if (cctx->stableIn_notConsumed) { + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_stable); + /* some early data was skipped - make it available for consumption */ + assert(input->pos >= cctx->stableIn_notConsumed); + input->pos -= cctx->stableIn_notConsumed; + cctx->stableIn_notConsumed = 0; + } + for (;;) { + size_t const ipos = input->pos; + size_t const opos = output->pos; + flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + cctx->consumedSrcSize += (U64)(input->pos - ipos); + cctx->producedCSize += (U64)(output->pos - opos); + if ( ZSTD_isError(flushMin) + || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + if (flushMin == 0) + ZSTD_CCtx_trace(cctx, 0); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + } + FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); + + if (endOp == ZSTD_e_continue) { + /* We only require some progress with ZSTD_e_continue, not maximal progress. + * We're done if we've consumed or produced any bytes, or either buffer is + * full. + */ + if (input->pos != ipos || output->pos != opos || input->pos == input->size || output->pos == output->size) + break; + } else { + assert(endOp == ZSTD_e_flush || endOp == ZSTD_e_end); + /* We require maximal progress. We're done when the flush is complete or the + * output buffer is full. + */ + if (flushMin == 0 || output->pos == output->size) + break; + } + } + DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic"); + /* Either we don't require maximum forward progress, we've finished the + * flush, or we are out of output space. 
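+ * (flushMin is ZSTDMT's lower bound on bytes still held internally;
+ * flushMin == 0 at ZSTD_e_end means the frame is fully flushed.)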
+ */
+    assert(endOp == ZSTD_e_continue || flushMin == 0 || output->pos == output->size);
+    ZSTD_setBufferExpectations(cctx, output, input);
+    return flushMin;
+    }
+#endif /* ZSTD_MULTITHREAD */
+    FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , "");
+    DEBUGLOG(5, "completed ZSTD_compressStream2");
+    ZSTD_setBufferExpectations(cctx, output, input);
+    return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp)
+{
+    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
+    size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+    *dstPos = output.pos;
+    *srcPos = input.pos;
+    return cErr;
+}
+
+size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+                      void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode;
+    ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode;
+    DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
+    ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+    /* Enable stable input/output buffers. */
+    cctx->requestedParams.inBufferMode = ZSTD_bm_stable;
+    cctx->requestedParams.outBufferMode = ZSTD_bm_stable;
+    {   size_t oPos = 0;
+        size_t iPos = 0;
+        size_t const result = ZSTD_compressStream2_simpleArgs(cctx,
+                                        dst, dstCapacity, &oPos,
+                                        src, srcSize, &iPos,
+                                        ZSTD_e_end);
+        /* Reset to the original values. */
+        cctx->requestedParams.inBufferMode = originalInBufferMode;
+        cctx->requestedParams.outBufferMode = originalOutBufferMode;
+        FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+        if (result != 0) {  /* compression not completed, due to lack of output space */
+            assert(oPos == dstCapacity);
+            RETURN_ERROR(dstSize_tooSmall, "");
+        }
+        assert(iPos == srcSize);   /* all input is expected to be consumed */
+        return oPos;
+    }
+}
+
+typedef struct {
+    U32 idx;             /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;   /* Position within sequence at idx */
+    size_t posInSrc;     /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
+/* ZSTD_validateSequence() :
+ * @offCode : is presumed to follow format required by ZSTD_storeSeq()
+ * @returns a ZSTD error code if sequence is not valid
+ */
+static size_t
+ZSTD_validateSequence(U32 offCode, U32 matchLength,
+                      size_t posInSrc, U32 windowLog, size_t dictSize)
+{
+    U32 const windowSize = 1 << windowLog;
+    /* posInSrc represents the amount of data the decoder would decode up to this point.
+     * As long as the amount of data decoded is less than or equal to window size, offsets may be
+     * larger than the total length of output decoded in order to reference the dict, even larger than
+     * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
+     */
+    size_t const offsetBound = posInSrc > windowSize ?
(size_t)windowSize : posInSrc + (size_t)dictSize; + RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), corruption_detected, "Offset too large!"); + RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); + return 0; +} + +/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +{ + U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { + offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { + offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { + offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { + offBase = REPCODE3_TO_OFFBASE; + } + return offBase; +} + +/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of + * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. + */ +static size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize) +{ + U32 idx = seqPos->idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + + DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); + + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { + dictSize = (U32)cctx->prefixDict.dictSize; + } else { + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; + U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; + U32 const offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize), + "Sequence validation failed"); + } + RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength); + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } + RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; +} + +/* Returns the number of bytes to move the current read position back by. + * Only non-zero if we ended up splitting a sequence. + * Otherwise, it may return a ZSTD error if something went wrong. 
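+ * (For example, a return value of 3 means the current block actually ends
+ * 3 bytes before the requested blockSize; the caller shrinks the block by
+ * those 3 bytes and resumes from the adjusted position.)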
+ * + * This function will attempt to scan through blockSize bytes + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. + * + * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to + * avoid splitting a match, or to avoid splitting a match such that it would produce a match + * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. + */ +static size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize) +{ + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; + BYTE const* ip = (BYTE const*)(src); + BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ + repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { + dictSize = cctx->prefixDict.dictSize; + } else { + dictSize = 0; + } + DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; + U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { + if (startPosInSequence >= litLength) { + startPosInSequence -= litLength; + litLength = 0; + matchLength -= startPosInSequence; + } else { + litLength -= startPosInSequence; + } + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; + idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ + DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u", + currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence); + if (endPosInSequence > litLength) { + U32 firstHalfMatchLength; + litLength = startPosInSequence >= litLength ? 
0 : litLength - startPosInSequence; + firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength; + if (matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) { + /* Only ever split the match if it is larger than the block size */ + U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence; + if (secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) { + /* Move the endPosInSequence backward so that it creates match of minMatch length */ + endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; + bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength; + firstHalfMatchLength -= bytesAdjustment; + } + matchLength = firstHalfMatchLength; + /* Flag that we split the last match - after storing the sequence, exit the loop, + but keep the value of endPosInSequence */ + finalMatchSplit = 1; + } else { + /* Move the position in sequence backwards so that we don't split match, and break to store + * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence + * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so + * would cause the first half of the match to be too small + */ + bytesAdjustment = endPosInSequence - currSeq.litLength; + endPosInSequence = currSeq.litLength; + break; + } + } else { + /* This sequence ends inside the literals, break to store the last literals */ + break; + } + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize), + "Sequence validation failed"); + } + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ + U32 lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + + return bytesAdjustment; +} + +typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize); +static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +{ + ZSTD_sequenceCopier sequenceCopier = NULL; + assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { + return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; + } else if (mode == ZSTD_sf_noBlockDelimiters) { + return ZSTD_copySequencesToSeqStoreNoBlockDelim; + } + assert(sequenceCopier != NULL); + return sequenceCopier; +} + +/* Discover the size of next block by searching for the delimiter. + * Note that a block delimiter **must** exist in this mode, + * otherwise it's an input error. + * The block size retrieved will be later compared to ensure it remains within bounds */ +static size_t +blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) +{ + int end = 0; + size_t blockSize = 0; + size_t spos = seqPos.idx; + DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); + assert(spos <= inSeqsSize); + while (spos < inSeqsSize) { + end = (inSeqs[spos].offset == 0); + blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; + if (end) { + if (inSeqs[spos].matchLength != 0) + RETURN_ERROR(corruption_detected, "delimiter format error : both matchlength and offset must be == 0"); + break; + } + spos++; + } + if (!end) + RETURN_ERROR(corruption_detected, "Reached end of sequences without finding a block delimiter"); + return blockSize; +} + +/* More a "target" block size */ +static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) +{ + int const lastBlock = (remaining <= blockSize); + return lastBlock ? 
remaining : blockSize; +} + +static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, + size_t blockSize, size_t remaining, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) +{ + DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); + if (mode == ZSTD_sf_noBlockDelimiters) + return blockSize_noDelimiter(blockSize, remaining); + { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); + FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); + if (explicitBlockSize > blockSize) + RETURN_ERROR(corruption_detected, "sequences incorrectly define a too large block"); + if (explicitBlockSize > remaining) + RETURN_ERROR(srcSize_wrong, "sequences define a frame longer than source"); + return explicitBlockSize; + } +} + +/* Compress, block-by-block, all of the sequences given. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. + */ +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + size_t cSize = 0; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + + BYTE const* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; + ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ + if (remaining == 0) { + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "No room for empty frame block header"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + cSize += ZSTD_blockHeaderSize; + } + + while (remaining) { + size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; + size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, + cctx->blockSize, remaining, + inSeqs, inSeqsSize, seqPos); + U32 const lastBlock = (blockSize == remaining); + FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); + assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); + DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + + additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. 
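+ * With the current constants the cutoff below works out to
+ * MIN_CBLOCK_SIZE + ZSTD_blockHeaderSize + 1 + 1 == 2 + 3 + 1 + 1 == 7 bytes.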
We need to revisit and change this logic to be more consistent */ + if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); + DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + continue; + } + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); + if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + /* Write block header into beginning of block*/ + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; + + if (lastBlock) { + break; + } else { + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + + DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; +} + +size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + BYTE* op = (BYTE*)dst; + size_t cSize = 0; + size_t compressedBlocksSize = 0; + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ + DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + 
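+    /* Illustrative sketch of the expected input shape (with explicit block
+     * delimiters enabled):
+     *     ZSTD_Sequence seqs[] = {
+     *         { 4, 4, 12, 0 },   // offset=4, litLength=4, matchLength=12
+     *         { 0, 2,  0, 0 }    // delimiter: offset==0 && matchLength==0;
+     *     };                     //   its litLength==2 closes the block
+     *     ZSTD_compressSequences(cctx, dst, dstCapacity, seqs, 2, src, 18);
+     * srcSize (here 4+12+2 == 18) must equal the sum of all litLength and
+     * matchLength fields. */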
FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ + frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); + op += frameHeaderSize; + dstCapacity -= frameHeaderSize; + cSize += frameHeaderSize; + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + XXH64_update(&cctx->xxhState, src, srcSize); + } + /* cSize includes block header size and compressed sequences size */ + compressedBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); + FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); + cSize += compressedBlocksSize; + dstCapacity -= compressedBlocksSize; + + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum); + MEM_writeLE32((char*)dst + cSize, checksum); + cSize += 4; + } + + DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; +} + +/*====== Finalize ======*/ + +static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) +{ + const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; + const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + return stableInput ? zcs->expectedInBuffer : nullInput; +} + +/*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ +size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); +} + + +size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) +{ + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); + FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; + size_t const checksumSize = (size_t)(zcs->frameEnded ? 
0 : zcs->appliedParams.fParams.checksumFlag * 4); + size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize; + DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush); + return toFlush; + } +} + + +/*-===== Pre-defined compression levels =====-*/ +#include "clevels.h" + +int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } +int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } + +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict); + switch (cParams.strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG; + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } + return cParams; +} + +static int ZSTD_dedicatedDictSearch_isSupported( + ZSTD_compressionParameters const* cParams) +{ + return (cParams->strategy >= ZSTD_greedy) + && (cParams->strategy <= ZSTD_lazy2) + && (cParams->hashLog > cParams->chainLog) + && (cParams->chainLog <= 24); +} + +/** + * Reverses the adjustment applied to cparams when enabling dedicated dict + * search. This is used to recover the params set to be used in the working + * context. (Otherwise, those tables would also grow.) + */ +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams) { + switch (cParams->strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog = ZSTD_HASHLOG_MIN; + } + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } +} + +static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) +{ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + case ZSTD_cpm_createCDict: + break; + case ZSTD_cpm_attachDict: + dictSize = 0; + break; + default: + assert(0); + break; + } + { int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; + size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; + return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + } +} + +/*! ZSTD_getCParams_internal() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. + * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. 
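+ *  Example: srcSizeHint == 100 KB (with dictSize == 0) yields rSize == 100 KB,
+ *  hence tableID == 1 + 1 + 0 == 2, selecting the parameter table tuned for
+ *  sources of at most 128 KB.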
 */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode)
+{
+    U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode);
+    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
+    int row;
+    DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
+
+    /* row */
+    if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT;   /* 0 == default */
+    else if (compressionLevel < 0) row = 0;   /* entry 0 is baseline for fast mode */
+    else if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+    else row = compressionLevel;
+
+    {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+        DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy);
+        /* acceleration factor */
+        if (compressionLevel < 0) {
+            int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel);
+            cp.targetLength = (unsigned)(-clampedCompressionLevel);
+        }
+        /* refine parameters based on srcSize & dictSize */
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
+    }
+}
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ *  Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+    if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
+}
+
+/*! ZSTD_getParams() :
+ *  same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ *  Fields of `ZSTD_frameParameters` are set to default values */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) {
+    ZSTD_parameters params;
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode);
+    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+    ZSTD_memset(&params, 0, sizeof(params));
+    params.cParams = cParams;
+    params.fParams.contentSizeFlag = 1;
+    return params;
+}
+
+/*! ZSTD_getParams() :
+ *  same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ *  Fields of `ZSTD_frameParameters` are set to default values */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+    if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
+}
diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_internal.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_internal.h
new file mode 100644
index 000000000..baa726f7d
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_internal.h
@@ -0,0 +1,1413 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */ + +/* This header contains definitions + * that shall **only** be used by modules within lib/compress. + */ + +#ifndef ZSTD_COMPRESS_H +#define ZSTD_COMPRESS_H + +/*-************************************* +* Dependencies +***************************************/ +#include "../common/zstd_internal.h" +#include "zstd_cwksp.h" +#ifdef ZSTD_MULTITHREAD +# include "zstdmt_compress.h" +#endif +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Constants +***************************************/ +#define kSearchStrength 8 +#define HASH_READ_SIZE 8 +#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted". + It could be confused for a real successor at index "1", if sorted as larger than its predecessor. + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +/*-************************************* +* Context memory management +***************************************/ +typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; +typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage; + +typedef struct ZSTD_prefixDict_s { + const void* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; +} ZSTD_prefixDict; + +typedef struct { + void* dictBuffer; + void const* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; + ZSTD_CDict* cdict; +} ZSTD_localDict; + +typedef struct { + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; + HUF_repeat repeatMode; +} ZSTD_hufCTables_t; + +typedef struct { + FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; + FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; + FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; + FSE_repeat offcode_repeatMode; + FSE_repeat matchlength_repeatMode; + FSE_repeat litlength_repeatMode; +} ZSTD_fseCTables_t; + +typedef struct { + ZSTD_hufCTables_t huf; + ZSTD_fseCTables_t fse; +} ZSTD_entropyCTables_t; + +/*********************************************** +* Entropy buffer statistics structs and funcs * +***********************************************/ +/** ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ +typedef struct { + symbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/** ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. 
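+ * For example, a block that reuses the previous block's FSE tables has all
+ * three types equal to set_repeat and fseTablesSize == 0, since no new table
+ * descriptions are written.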
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize); + +/********************************* +* Compression internals structs * +*********************************/ + +typedef struct { + U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */ + U32 len; /* Raw length of match */ +} ZSTD_match_t; + +typedef struct { + U32 offset; /* Offset of sequence */ + U32 litLength; /* Length of literals prior to match */ + U32 matchLength; /* Raw length of match */ +} rawSeq; + +typedef struct { + rawSeq* seq; /* The start of the sequences */ + size_t pos; /* The index in seq where reading stopped. pos <= size. */ + size_t posInSequence; /* The position within the sequence at seq[pos] where reading + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +} rawSeqStore_t; + +UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + +typedef struct { + int price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_optimal_t; + +typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + +typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ + U32 matchLengthSum; /* nb of matchLength codes */ + U32 offCodeSum; /* nb of offset codes */ + U32 litSumBasePrice; /* to compare to log2(litfreq) */ + U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */ + U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */ + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ + ZSTD_paramSwitch_e literalCompressionMode; +} optState_t; + +typedef struct { + ZSTD_entropyCTables_t entropy; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_compressedBlockState_t; + +typedef struct { + BYTE const* nextSrc; /* next block here to continue on current 
prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has run since + * ZSTD_window_init(). Useful for debugging coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. + */ +} ZSTD_window_t; + +#define ZSTD_WINDOW_START_INDEX 2 + +typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +struct ZSTD_matchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. + * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance. + * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity(). + * When dict referential is copied into active context (i.e. not attached), + * loadedDictEnd == dictSize, since referential starts from zero. + */ + U32 nextToUpdate; /* index from which to continue table update */ + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ + U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + + U32* hashTable; + U32* hashTable3; + U32* chainTable; + + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; + + /* Controls prefetching in some dictMatchState matchfinders. + * This behavior is controlled from the cctx ms. + * This parameter has no effect in the cdict ms. */ + int prefetchCDictTables; +}; + +typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; + ZSTD_matchState_t matchState; +} ZSTD_blockState_t; + +typedef struct { + U32 offset; + U32 checksum; +} ldmEntry_t; + +typedef struct { + BYTE const* split; + U32 hash; + U32 checksum; + ldmEntry_t* bucket; +} ldmMatchCandidate_t; + +#define LDM_BATCH_SIZE 64 + +typedef struct { + ZSTD_window_t window; /* State for the window round buffer management */ + ldmEntry_t* hashTable; + U32 loadedDictEnd; + BYTE* bucketOffsets; /* Next position in bucket to insert entry */ + size_t splitIndices[LDM_BATCH_SIZE]; + ldmMatchCandidate_t matchCandidates[LDM_BATCH_SIZE]; +} ldmState_t; + +typedef struct { + ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ + U32 hashRateLog; /* Log number of entries to skip */ + U32 windowLog; /* Window log for the LDM */ +} ldmParams_t; + +typedef struct { + int collectSequences; + ZSTD_Sequence* seqStart; + size_t seqIndex; + size_t maxSequences; +} SeqCollector; + +struct ZSTD_CCtx_params_s { + ZSTD_format_e format; + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; + + int compressionLevel; + int forceWindow; /* force back-references to respect limit of + * 1< 63) ? ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; +} + +/* ZSTD_MLcode() : + * note : mlBase = matchLength - MINMATCH; + * because it's the format it's stored in seqStore->sequences */ +MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) +{ + static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, + 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; + static const U32 ML_deltaCode = 36; + return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; +} + +/* ZSTD_cParam_withinBounds: + * @return 1 if value is within cParam bounds, + * 0 otherwise */ +MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +/* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +MEM_STATIC size_t +ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) +{ + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); + DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); + ZSTD_memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize); + return ZSTD_blockHeaderSize + srcSize; +} + +MEM_STATIC size_t +ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) +{ + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); + RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, ""); + MEM_writeLE24(op, cBlockHeader); + op[3] = src; + return 4; +} + + +/* ZSTD_minGain() : + * minimum compression required + * to generate a compress block or a compressed literals section. + * note : use same formula for both situations */ +MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) +{ + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + return (srcSize >> minlog) + 2; +} + +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams) +{ + switch (cctxParams->literalCompressionMode) { + case ZSTD_ps_enable: + return 0; + case ZSTD_ps_disable: + return 1; + default: + assert(0 /* impossible: pre-validated */); + ZSTD_FALLTHROUGH; + case ZSTD_ps_auto: + return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); + } +} + +/*! ZSTD_safecopyLiterals() : + * memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w. + * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single + * large copies. + */ +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) +{ + assert(iend > ilimit_w); + if (ip <= ilimit_w) { + ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); + op += ilimit_w - ip; + ip = ilimit_w; + } + while (ip < iend) *op++ = *ip++; +} + + +#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) +#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) +#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) +#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ +#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) +#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) +#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) +#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) +#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + +/*! ZSTD_storeSeq() : + * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. + * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH + * Allowed to over-read literals up to litLimit. +*/ +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, + U32 offBase, + size_t matchLength) +{ + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; + BYTE const* const litEnd = literals + litLength; +#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6) + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", + pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } +#endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); + /* copy Literals */ + assert(seqStorePtr->maxNbLit <= 128 KB); + assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit); + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. + * First copy 16 bytes, because literals are likely short. 
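+         * e.g. a 5-byte literal run is served entirely by the single 16-byte
+         * copy; the overshoot is harmless because litEnd <= litLimit_w
+         * guarantees WILDCOPY_OVERLENGTH bytes of readable slack.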
+ */ + ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); + } + } else { + ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w); + } + seqStorePtr->lit += litLength; + + /* literal Length */ + if (litLength>0xFFFF) { + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_literalLength; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); + { size_t const mlBase = matchLength - MINMATCH; + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_matchLength; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].mlBase = (U16)mlBase; + } + + seqStorePtr->sequences++; +} + +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) +{ + if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ + U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2];
+            rep[1] = rep[0];
+            rep[0] = currentOffset;
+        } else {   /* repCode == 0 */
+            /* nothing to do */
+        }
+    }
+}
+
+typedef struct repcodes_s {
+    U32 rep[3];
+} repcodes_t;
+
+MEM_STATIC repcodes_t
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
+{
+    repcodes_t newReps;
+    ZSTD_memcpy(&newReps, rep, sizeof(newReps));
+    ZSTD_updateRep(newReps.rep, offBase, ll0);
+    return newReps;
+}
+
+
+/*-*************************************
+*  Match length counter
+***************************************/
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+          if (diff) return ZSTD_NbCommonBytes(diff); }
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+            pIn += ZSTD_NbCommonBytes(diff);
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+ *  can count match length with `ip` & `match` in 2 different segments.
+ *  convention : on reaching mEnd, match count continue starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+    size_t const matchLength = ZSTD_count(ip, match, vEnd);
+    if (match + matchLength != mEnd) return matchLength;
+    DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+    DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+    DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+    DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
+    DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(iStart, ip+matchLength, iEnd));
+    return matchLength + ZSTD_count(iStart, ip+matchLength, iEnd);
+}
+
+
+/*-*************************************
+*  Hashes
+***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32    ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+    default:
+    case 4: return ZSTD_hash4Ptr(p, hBits);
+    case 5: return ZSTD_hash5Ptr(p, hBits);
+    case 6: return ZSTD_hash6Ptr(p, hBits);
+    case 7: return ZSTD_hash7Ptr(p, hBits);
+    case 8: return ZSTD_hash8Ptr(p, hBits);
+    }
+}
+
+/** ZSTD_ipow() :
+ *  Return base^exponent.
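+ *  Computed by square-and-multiply, e.g. ZSTD_ipow(3, 5) evaluates
+ *  3^5 == 3^4 * 3^1 == 243 using O(log2(exponent)) multiplications.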
+ */ +static U64 ZSTD_ipow(U64 base, U64 exponent) +{ + U64 power = 1; + while (exponent) { + if (exponent & 1) power *= base; + exponent >>= 1; + base *= base; + } + return power; +} + +#define ZSTD_ROLL_HASH_CHAR_OFFSET 10 + +/** ZSTD_rollingHash_append() : + * Add the buffer to the hash value. + */ +static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size) +{ + BYTE const* istart = (BYTE const*)buf; + size_t pos; + for (pos = 0; pos < size; ++pos) { + hash *= prime8bytes; + hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET; + } + return hash; +} + +/** ZSTD_rollingHash_compute() : + * Compute the rolling hash value of the buffer. + */ +MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size) +{ + return ZSTD_rollingHash_append(0, buf, size); +} + +/** ZSTD_rollingHash_primePower() : + * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash + * over a window of length bytes. + */ +MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length) +{ + return ZSTD_ipow(prime8bytes, length - 1); +} + +/** ZSTD_rollingHash_rotate() : + * Rotate the rolling hash by one byte. + */ +MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower) +{ + hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower; + hash *= prime8bytes; + hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET; + return hash; +} + +/*-************************************* +* Round buffer management +***************************************/ +#if (ZSTD_WINDOWLOG_MAX_64 > 31) +# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +#endif +/* Max current allowed */ +#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) +/* Maximum chunk size before overflow correction needs to be called again */ +#define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ + - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */ + +/** + * ZSTD_window_clear(): + * Clears the window containing the history by simply setting it to empty. + */ +MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) +{ + size_t const endT = (size_t)(window->nextSrc - window->base); + U32 const end = (U32)endT; + + window->lowLimit = end; + window->dictLimit = end; +} + +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit == ZSTD_WINDOW_START_INDEX && + window.lowLimit == ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; +} + +/** + * ZSTD_window_hasExtDict(): + * Returns non-zero if the window has a non-empty extDict. + */ +MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) +{ + return window.lowLimit < window.dictLimit; +} + +/** + * ZSTD_matchState_dictMode(): + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) +{ + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : + ms->dictMatchState != NULL ? + (ms->dictMatchState->dedicatedDictSearch ? ZSTD_dedicatedDictSearch : ZSTD_dictMatchState) : + ZSTD_noDict; +} + +/* Defining this macro to non-zero tells zstd to run the overflow correction + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. 
+ */
+#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY
+#  ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1
+#  else
+#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0
+#  endif
+#endif
+
+/**
+ * ZSTD_window_canOverflowCorrect():
+ * Returns non-zero if the indices are large enough for overflow correction
+ * to work correctly without impacting compression ratio.
+ */
+MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window,
+                                              U32 cycleLog,
+                                              U32 maxDist,
+                                              U32 loadedDictEnd,
+                                              void const* src)
+{
+    U32 const cycleSize = 1u << cycleLog;
+    U32 const curr = (U32)((BYTE const*)src - window.base);
+    U32 const minIndexToOverflowCorrect = cycleSize
+                                        + MAX(maxDist, cycleSize)
+                                        + ZSTD_WINDOW_START_INDEX;
+
+    /* Adjust the min index to backoff the overflow correction frequency,
+     * so we don't waste too much CPU in overflow correction. If this
+     * computation overflows we don't really care, we just need to make
+     * sure it is at least minIndexToOverflowCorrect.
+     */
+    U32 const adjustment = window.nbOverflowCorrections + 1;
+    U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment,
+                                  minIndexToOverflowCorrect);
+    U32 const indexLargeEnough = curr > adjustedIndex;
+
+    /* Only overflow correct early if the dictionary is invalidated already,
+     * so we don't hurt compression ratio.
+     */
+    U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd;
+
+    return indexLargeEnough && dictionaryInvalidated;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  U32 cycleLog,
+                                                  U32 maxDist,
+                                                  U32 loadedDictEnd,
+                                                  void const* src,
+                                                  void const* srcEnd)
+{
+    U32 const curr = (U32)((BYTE const*)srcEnd - window.base);
+    if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) {
+        if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) {
+            return 1;
+        }
+    }
+    return curr > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleSize = 1u << cycleLog;
+    U32 const cycleMask = cycleSize - 1;
+    U32 const curr = (U32)((BYTE const*)src - window->base);
+    U32 const currentCycle = curr & cycleMask;
+    /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */
+    U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX
+                                     ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX)
+                                     : 0;
+    U32 const newCurrent = currentCycle
+                         + currentCycleCorrection
+                         + MAX(maxDist, cycleSize);
+    U32 const correction = curr - newCurrent;
+    /* maxDist must be a power of two so that:
+     *   (newCurrent & cycleMask) == (curr & cycleMask)
+     * This is required to not corrupt the chains / binary tree.
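+     * Worked example (illustrative values): cycleLog == 20, maxDist == 1<<20,
+     * curr == 0xC0012345: currentCycle == 0x12345, newCurrent == 0x112345,
+     * correction == 0xBFF00000; old and new indices share the same low
+     * 20 bits, so relative distances within the cycle are preserved.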
+ */ + assert((maxDist & (maxDist - 1)) == 0); + assert((curr & cycleMask) == (newCurrent & cycleMask)); + assert(curr > newCurrent); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } + + window->base += correction; + window->dictBase += correction; + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->lowLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->lowLimit -= correction; + } + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->dictLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->dictLimit -= correction; + } + + /* Ensure we can still reference the full window. */ + assert(newCurrent >= maxDist); + assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX); + /* Ensure that lowLimit and dictLimit didn't underflow. */ + assert(window->lowLimit <= newCurrent); + assert(window->dictLimit <= newCurrent); + + ++window->nbOverflowCorrections; + + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, + window->lowLimit); + return correction; +} + +/** + * ZSTD_window_enforceMaxDist(): + * Updates lowLimit so that: + * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * + * It ensures index is valid as long as index >= lowLimit. + * This must be called before a block compression call. + * + * loadedDictEnd is only defined if a dictionary is in use for current compression. + * As the name implies, loadedDictEnd represents the index at end of dictionary. + * The value lies within context's referential, it can be directly compared to blockEndIdx. + * + * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0. + * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit. + * This is because dictionaries are allowed to be referenced fully + * as long as the last byte of the dictionary is in the window. + * Once input has progressed beyond window size, dictionary cannot be referenced anymore. + * + * In normal dict mode, the dictionary lies between lowLimit and dictLimit. + * In dictMatchState mode, lowLimit and dictLimit are the same, + * and the dictionary is below them. + * forceWindow and dictMatchState are therefore incompatible. + */ +MEM_STATIC void +ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0; + DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + + /* - When there is no dictionary : loadedDictEnd == 0. + In which case, the test (blockEndIdx > maxDist) is merely to avoid + overflowing next operation `newLowLimit = blockEndIdx - maxDist`. + - When there is a standard dictionary : + Index referential is copied from the dictionary, + which means it starts from 0. + In which case, loadedDictEnd == dictSize, + and it makes sense to compare `blockEndIdx > maxDist + dictSize` + since `blockEndIdx` also starts from zero. + - When there is an attached dictionary : + loadedDictEnd is expressed within the referential of the context, + so it can be directly compared against blockEndIdx. 
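+ *
+ * Illustrative numbers: with maxDist = 1<<20 (1048576) and no dictionary
+ * (loadedDictEnd == 0), blockEndIdx = 5000000 raises lowLimit to
+ * 5000000 - 1048576 = 3951424; matches at smaller indices become invalid.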
+ */ + if (blockEndIdx > maxDist + loadedDictEnd) { + U32 const newLowLimit = blockEndIdx - maxDist; + if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; + if (window->dictLimit < window->lowLimit) { + DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u", + (unsigned)window->dictLimit, (unsigned)window->lowLimit); + window->dictLimit = window->lowLimit; + } + /* On reaching window size, dictionaries are invalidated */ + if (loadedDictEndPtr) *loadedDictEndPtr = 0; + if (dictMatchStatePtr) *dictMatchStatePtr = NULL; + } +} + +/* Similar to ZSTD_window_enforceMaxDist(), + * but only invalidates dictionary + * when input progresses beyond window size. + * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non NULL) + * loadedDictEnd uses same referential as window->base + * maxDist is the window size */ +MEM_STATIC void +ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); + { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = *loadedDictEndPtr; + DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + + if (blockEndIdx > loadedDictEnd + maxDist) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; + *dictMatchStatePtr = NULL; + } else { + if (*loadedDictEndPtr != 0) { + DEBUGLOG(6, "dictionary considered valid for current block"); + } } } +} + +MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + ZSTD_memset(window, 0, sizeof(*window)); + window->base = (BYTE const*)" "; + window->dictBase = (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */ + window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */ + window->nbOverflowCorrections = 0; +} + +/** + * ZSTD_window_update(): + * Updates the window by appending [src, src + srcSize) to the window. + * If it is not contiguous, the current prefix becomes the extDict, and we + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. 
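+ * Illustrative streaming sequence: two consecutive calls where src equals
+ * window->nextSrc extend one contiguous prefix; a call on a different buffer
+ * (or with forceNonContiguous != 0) demotes the current prefix to extDict,
+ * discards the previous extDict, and restarts the prefix at src.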
+ */ +MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) +{ + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; + DEBUGLOG(5, "ZSTD_window_update"); + if (srcSize == 0) + return contiguous; + assert(window->base != NULL); + assert(window->dictBase != NULL); + /* Check if blocks follow each other */ + if (src != window->nextSrc || forceNonContiguous) { + /* not contiguous */ + size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); + DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); + window->lowLimit = window->dictLimit; + assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */ + window->dictLimit = (U32)distanceFromBase; + window->dictBase = window->base; + window->base = ip - distanceFromBase; + /* ms->nextToUpdate = window->dictLimit; */ + if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */ + contiguous = 0; + } + window->nextSrc = ip + srcSize; + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { + ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; + U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } + return contiguous; +} + +/** + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) +{ + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); + /* When using a dictionary the entire dictionary is valid if a single byte of the dictionary + * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't + * valid for the entire block. So this check is sufficient to find the lowest valid match index. + */ + U32 const matchLowest = isDictionary ? lowestValid : withinWindow; + return matchLowest; +} + +/** + * Returns the lowest allowed match index in the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) +{ + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); + /* When computing the lowest prefix index we need to take the dictionary into account to handle + * the edge case where the dictionary and the source are contiguous in memory. + */ + U32 const matchLowest = isDictionary ? 
lowestValid : withinWindow; + return matchLowest; +} + + + +/* debug functions */ +#if (DEBUGLEVEL>=2) + +MEM_STATIC double ZSTD_fWeight(U32 rawStat) +{ + U32 const fp_accuracy = 8; + U32 const fp_multiplier = (1 << fp_accuracy); + U32 const newStat = rawStat + 1; + U32 const hb = ZSTD_highbit32(newStat); + U32 const BWeight = hb * fp_multiplier; + U32 const FWeight = (newStat << fp_accuracy) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + fp_accuracy < 31); + return (double)weight / fp_multiplier; +} + +/* display a table content, + * listing each element, its frequency, and its predicted bit cost */ +MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) +{ + unsigned u, sum; + for (u=0, sum=0; u<=max; u++) sum += table[u]; + DEBUGLOG(2, "total nb elts: %u", sum); + for (u=0; u<=max; u++) { + DEBUGLOG(2, "%2u: %5u (%.2f)", + u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) ); + } +} + +#endif + +/* Short Cache */ + +/* Normally, zstd matchfinders follow this flow: + * 1. Compute hash at ip + * 2. Load index from hashTable[hash] + * 3. Check if *ip == *(base + index) + * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. + * + * Short cache is an optimization which allows us to avoid step 3 most of the time + * when the data doesn't actually match. With short cache, the flow becomes: + * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. + * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. + * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. + * + * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to + * dictMatchState matchfinders. + */ +#define ZSTD_SHORT_CACHE_TAG_BITS 8 +#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) + +/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. + * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ +MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { + size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); + assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); + hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; +} + +/* Helper function for short cache matchfinders. + * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ +MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { + U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; + U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; + return tag1 == tag2; +} + +#if defined (__cplusplus) +} +#endif + +/* =============================================================== + * Shared internal declarations + * These prototypes may be called from sources not in lib/compress + * =============================================================== */ + +/* ZSTD_loadCEntropy() : + * dict : must point at beginning of a valid zstd dictionary. 
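+ * (a valid zstd dictionary begins with the 4-byte magic number
+ * 0xEC30A437, stored little-endian)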
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : magic number supposed already checked
+ *               and dictSize >= 8 */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         const void* const dict, size_t dictSize);
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means 0!
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
+
+/*! ZSTD_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                     const ZSTD_CDict* cdict,
+                     const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
+/*! ZSTD_getCParamsFromCDict() :
+ *  as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    const ZSTD_CCtx_params* params,
+                                    unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+                                       void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize,
+                                       const ZSTD_CCtx_params* params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+
+/** ZSTD_CCtx_trace() :
+ *  Trace the end of a compression call.
+ */
+void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
+
+#endif /* ZSTD_COMPRESS_H */
diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.c
new file mode 100644
index 000000000..15bde09e6
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "zstd_compress_literals.h"
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
+    RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ZSTD_memcpy(ostart + flSize, src, srcSize);
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    return srcSize + flSize;
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ostart[flSize] = *(const BYTE*)src;
+    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    return flSize+1;
+}
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize,
+                              void* entropyWorkspace, size_t entropyWorkspaceSize,
+                              const int bmi2,
+                              unsigned suspectUncompressible)
+{
+    size_t const minGain = ZSTD_minGain(srcSize, strategy);
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    if (disableLiteralCompression)
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    /* small ? don't even attempt compression (speed opt) */
+#   define COMPRESS_LITERALS_SIZE_MIN 63
+    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ?
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; + int const preferRepeat = (strategy < ZSTD_lazy) ? srcSize <= 1024 : 0; + typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int, int, unsigned); + huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; + huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; + cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, + src, srcSize, + HUF_SYMBOLVALUE_MAX, LitHufLog, + entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, + &repeat, preferRepeat, + bmi2, suspectUncompressible); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ + DEBUGLOG(5, "Reusing previous huffman table"); + hType = set_repeat; + } + } + + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + if (cLitSize==1) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + if (hType == set_compressed) { + /* using a newly constructed table */ + nextHuf->repeatMode = HUF_repeat_check; + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize)); + return lhSize+cLitSize; +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.h new file mode 100644 index 000000000..9775fb97c --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_literals.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_COMPRESS_LITERALS_H +#define ZSTD_COMPRESS_LITERALS_H + +#include "zstd_compress_internal.h" /* ZSTD_hufCTables_t, ZSTD_minGain() */ + + +size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, int disableLiteralCompression, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, + const int bmi2, + unsigned suspectUncompressible); + +#endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.c new file mode 100644 index 000000000..2c1eee567 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +#include "zstd_compress_sequences.h" + +/** + * -log2(x / 256) lookup table for x in [0, 256). + * If x == 0: Return 0 + * Else: Return floor(-log2(x / 256) * 256) + */ +static unsigned const kInverseProbabilityLog256[256] = { + 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162, + 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889, + 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734, + 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626, + 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542, + 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473, + 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415, + 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366, + 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322, + 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282, + 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247, + 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215, + 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185, + 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, + 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132, + 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108, + 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85, + 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, + 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44, + 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25, + 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7, + 5, 4, 2, 1, +}; + +static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { + void const* ptr = ctable; + U16 const* u16ptr = (U16 const*)ptr; + U32 const maxSymbolValue = MEM_read16(u16ptr + 1); + return maxSymbolValue; +} + +/** + * Returns true if we should use ncount=-1 else we should + * use ncount=1 for low probability symbols instead. 
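+ * (A normalized count of -1 marks a "less than 1" probability symbol; it
+ * reserves the minimum possible table share for rare symbols, which pays
+ * off on large blocks - hence the nbSeq threshold below.)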
+ */ +static unsigned ZSTD_useLowProbCount(size_t const nbSeq) +{ + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on + * compressibility. + */ + return nbSeq >= 2048; +} + +/** + * Returns the cost in bytes of encoding the normalized count header. + * Returns an error if any of the helper functions return an error. + */ +static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max, + size_t const nbSeq, unsigned const FSELog) +{ + BYTE wksp[FSE_NCOUNTBOUND]; + S16 norm[MaxSeq + 1]; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), ""); + return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog); +} + +/** + * Returns the cost in bits of encoding the distribution described by count + * using the entropy bound. + */ +static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total) +{ + unsigned cost = 0; + unsigned s; + + assert(total > 0); + for (s = 0; s <= max; ++s) { + unsigned norm = (unsigned)((256 * count[s]) / total); + if (count[s] != 0 && norm == 0) + norm = 1; + assert(count[s] < total); + cost += count[s] * kInverseProbabilityLog256[norm]; + } + return cost >> 8; +} + +/** + * Returns the cost in bits of encoding the distribution in count using ctable. + * Returns an error if ctable cannot represent all the symbols in count. + */ +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max) +{ + unsigned const kAccuracyLog = 8; + size_t cost = 0; + unsigned s; + FSE_CState_t cstate; + FSE_initCState(&cstate, ctable); + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", + ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } + for (s = 0; s <= max; ++s) { + unsigned const tableLog = cstate.stateLog; + unsigned const badCost = (tableLog + 1) << kAccuracyLog; + unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); + if (count[s] == 0) + continue; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += (size_t)count[s] * bitCost; + } + return cost >> kAccuracyLog; +} + +/** + * Returns the cost in bits of encoding the distribution in count using the + * table described by norm. The max symbol support by norm is assumed >= max. + * norm must be valid for every symbol with non-zero probability in count. + */ +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) +{ + unsigned const shift = 8 - accuracyLog; + size_t cost = 0; + unsigned s; + assert(accuracyLog <= 8); + for (s = 0; s <= max; ++s) { + unsigned const normAcc = (norm[s] != -1) ? 
(unsigned)norm[s] : 1; + unsigned const norm256 = normAcc << shift; + assert(norm256 > 0); + assert(norm256 < 256); + cost += count[s] * kInverseProbabilityLog256[norm256]; + } + return cost >> 8; +} + +symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) +{ + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { + /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ + DEBUGLOG(5, "Selected set_basic"); + return set_basic; + } + DEBUGLOG(5, "Selected set_rle"); + return set_rle; + } + if (strategy < ZSTD_lazy) { + if (isDefaultAllowed) { + size_t const staticFse_nbSeq_max = 1000; + size_t const mult = 10 - strategy; + size_t const baseLog = 3; + size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */ + assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */ + assert(mult <= 9 && mult >= 7); + if ( (*repeatMode == FSE_repeat_valid) + && (nbSeq < staticFse_nbSeq_max) ) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( (nbSeq < dynamicFse_nbSeq_min) + || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. + */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + } + } else { + size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC); + size_t const repeatCost = *repeatMode != FSE_repeat_none ? 
ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC); + size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog); + size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq); + + if (isDefaultAllowed) { + assert(!ZSTD_isError(basicCost)); + assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost))); + } + assert(!ZSTD_isError(NCountCost)); + assert(compressedCost < ERROR(maxCode)); + DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u", + (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost); + if (basicCost <= repeatCost && basicCost <= compressedCost) { + DEBUGLOG(5, "Selected set_basic"); + assert(isDefaultAllowed); + *repeatMode = FSE_repeat_none; + return set_basic; + } + if (repeatCost <= compressedCost) { + DEBUGLOG(5, "Selected set_repeat"); + assert(!ZSTD_isError(repeatCost)); + return set_repeat; + } + assert(compressedCost < basicCost && compressedCost < repeatCost); + } + DEBUGLOG(5, "Selected set_compressed"); + *repeatMode = FSE_repeat_check; + return set_compressed; +} + +typedef struct { + S16 norm[MaxSeq + 1]; + U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)]; +} ZSTD_BuildCTableWksp; + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize) +{ + BYTE* op = (BYTE*)dst; + const BYTE* const oend = op + dstCapacity; + DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity); + + switch (type) { + case set_rle: + FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), ""); + RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space"); + *op = codeTable[0]; + return 1; + case set_repeat: + ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize); + return 0; + case set_basic: + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ + return 0; + case set_compressed: { + ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace; + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + if (count[codeTable[nbSeq-1]] > 1) { + count[codeTable[nbSeq-1]]--; + nbSeq_1--; + } + assert(nbSeq_1 > 1); + assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); + (void)entropyWorkspaceSize; + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >= op); + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); + return NCountSize; + } + } + default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach"); + } +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_encodeSequences_body( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int 
longOffsets)
+{
+    BIT_CStream_t blockStream;
+    FSE_CState_t  stateMatchLength;
+    FSE_CState_t  stateOffsetBits;
+    FSE_CState_t  stateLitLength;
+
+    RETURN_ERROR_IF(
+        ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+        dstSize_tooSmall, "not enough space remaining");
+    DEBUGLOG(6, "available space for bitstream : %i  (dstCapacity=%u)",
+                (int)(blockStream.endPtr - blockStream.startPtr),
+                (unsigned)dstCapacity);
+
+    /* first symbols */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits,
+                    ofBits - extraBits);
+    } else {
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        (unsigned)sequences[n].litLength,
+                        (unsigned)sequences[n].mlBase + MINMATCH,
+                        (unsigned)sequences[n].offBase);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {
+                unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offBase, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offBase, ofBits);    /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+            DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+        return streamSize;
+    }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static BMI2_TARGET_ATTRIBUTE size_t
+ZSTD_encodeSequences_bmi2( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + +#endif + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) +{ + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); +#if DYNAMIC_BMI2 + if (bmi2) { + return ZSTD_encodeSequences_bmi2(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); + } +#endif + (void)bmi2; + return ZSTD_encodeSequences_default(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.h new file mode 100644 index 000000000..7991364c2 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_sequences.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_COMPRESS_SEQUENCES_H +#define ZSTD_COMPRESS_SEQUENCES_H + +#include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ + +typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +} ZSTD_defaultPolicy_e; + +symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize); + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max); + +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max); +#endif /* ZSTD_COMPRESS_SEQUENCES_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.c new file mode 100644 index 000000000..eed58e7cf --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.c @@ -0,0 +1,579 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +#include "zstd_compress_superblock.h" + +#include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */ +#include "hist.h" /* HIST_countFast_wksp */ +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */ +#include "zstd_compress_sequences.h" +#include "zstd_compress_literals.h" + +/** ZSTD_compressSubBlock_literal() : + * Compresses literals section for a sub-block. + * When we have to write the Huffman table we will sometimes choose a header + * size larger than necessary. This is because we have to pick the header size + * before we know the table size + compressed size, so we have a bound on the + * table size. If we guessed incorrectly, we fall back to uncompressed literals. + * + * We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded + * in writing the header, otherwise it is set to 0. + * + * hufMetadata->hType has literals block type info. + * If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block. + * If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block. 
+ * If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block + * Or 0 if unable to compress. + * Or error code */ +static size_t +ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + const BYTE* literals, size_t litSize, + void* dst, size_t dstSize, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + + (void)bmi2; /* TODO bmi2... */ + + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; + if (litSize == 0 || hufMetadata->hType == set_basic) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } else if (hufMetadata->hType == set_rle) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal"); + return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize); + } + + assert(litSize > 0); + assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat); + + if (writeEntropy && hufMetadata->hType == set_compressed) { + ZSTD_memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize); + op += hufMetadata->hufDesSize; + cLitSize += hufMetadata->hufDesSize; + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + + /* TODO bmi2 */ + { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) + : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { + DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize)); + return 0; + } + /* If we expand and we aren't writing a header then emit uncompressed */ + if (!writeEntropy && cLitSize >= litSize) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + /* If we are writing headers then allow expansion that doesn't change our header size. 
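+         * (Header sizes, per the switch below: 3 bytes while both sizes fit in
+         * 10 bits each, 4 bytes up to 14 bits, 5 bytes up to 18 bits.)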
*/ + if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) { + assert(cLitSize > litSize); + DEBUGLOG(5, "Literals expanded beyond allowed header size"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize); + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); + return op-ostart; +} + +static size_t +ZSTD_seqDecompressedSize(seqStore_t const* seqStore, + const seqDef* sequences, size_t nbSeq, + size_t litSize, int lastSequence) +{ + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; + size_t matchLengthSum = 0; + size_t litLengthSum = 0; + (void)(litLengthSum); /* suppress unused variable warning on some environments */ + while (send-sp > 0) { + ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); + litLengthSum += seqLen.litLength; + matchLengthSum += seqLen.matchLength; + sp++; + } + assert(litLengthSum <= litSize); + if (!lastSequence) { + assert(litLengthSum == litSize); + } + return matchLengthSum + litSize; +} + +/** ZSTD_compressSubBlock_sequences() : + * Compresses sequences section for a sub-block. + * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have + * symbol compression modes for the super-block. + * The first successfully compressed block will have these in its header. + * We set entropyWritten=1 when we succeed in compressing the sequences. + * The following sub-blocks will always have repeat mode. + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. 
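+ * Worked example of the sequences-count header written below: nbSeq = 100
+ * fits in one byte; nbSeq = 1000 takes two bytes (0x83 0xE8, i.e.
+ * 0x80 + (1000>>8), then 1000 & 0xFF); nbSeq >= LONGNBSEQ uses three
+ * bytes starting with 0xFF.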
*/ +static size_t +ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + const seqDef* sequences, size_t nbSeq, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + BYTE* seqHead; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets); + + *entropyWritten = 0; + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); + if (nbSeq < 0x7F) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { + return op - ostart; + } + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart)); + + if (writeEntropy) { + const U32 LLtype = fseMetadata->llType; + const U32 Offtype = fseMetadata->ofType; + const U32 MLtype = fseMetadata->mlType; + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize); + *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + ZSTD_memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize); + op += fseMetadata->fseTablesSize; + } else { + const U32 repeat = set_repeat; + *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2)); + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, oend - op, + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. + */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(fseMetadata->lastCountSize + bitstreamSize == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } +#endif + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize); + } + + /* zstd versions <= 1.4.0 mistakenly report error when + * sequences section body size is less than 3 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1664. + * This can happen when the previous sequences section block is compressed + * with rle mode and the current block's sequences section is compressed + * with repeat mode where sequences section body size can be 1 byte. 
+ */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (op-seqHead < 4) { + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting " + "an uncompressed block when sequences are < 4 bytes"); + return 0; + } +#endif + + *entropyWritten = 1; + return op - ostart; +} + +/** ZSTD_compressSubBlock() : + * Compresses a single sub-block. + * @return : compressed size of the sub-block + * Or 0 if it failed to compress. */ +static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const seqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, + int writeLitEntropy, int writeSeqEntropy, + int* litEntropyWritten, int* seqEntropyWritten, + U32 lastBlock) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart + ZSTD_blockHeaderSize; + DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)", + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, + op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; + } + { size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse, + &entropyMetadata->fseMetadata, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, + op, oend-op, + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ + { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } + return op-ostart; +} + +static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = 255; + size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + 
const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. */ + assert(max <= defaultMax); + cSymbolTypeSizeEstimateInBits = max <= defaultMax + ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max) + : ERROR(GENERIC); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10; + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits / 8; +} + +static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t cSeqSizeEstimate = 0; + if (nbSeq == 0) return sequencesSectionHeaderSize; + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, + nbSeq, fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL, + nbSeq, fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML, + nbSeq, fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t cSizeEstimate = 0; + cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return cSizeEstimate + ZSTD_blockHeaderSize; +} + +static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +{ + if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle) + return 1; + if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle) + return 1; + if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle) + return 1; + return 0; +} + +/** ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks 
and compresses them. + * Entropy will be written to the first block. + * The following blocks will use repeat mode to compress. + * All sub-blocks are compressed blocks (no raw or rle blocks). + * @return : compressed size of the super block (which is multiple ZSTD blocks) + * Or 0 if it failed to compress. */ +static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) +{ + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; + const seqDef* sp = sstart; + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; + size_t targetCBlockSize = cctxParams->targetCBlockSize; + size_t litSize, seqCount; + int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; + int writeSeqEntropy = 1; + int lastSequence = 0; + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", + (unsigned)(lend-lp), (unsigned)(send-sstart)); + + litSize = 0; + seqCount = 0; + do { + size_t cBlockSizeEstimate = 0; + if (sstart == send) { + lastSequence = 1; + } else { + const seqDef* const sequence = sp + seqCount; + lastSequence = sequence == send - 1; + litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; + seqCount++; + } + if (lastSequence) { + assert(lp <= lend); + assert(litSize <= (size_t)(lend - lp)); + litSize = (size_t)(lend - lp); + } + /* I think there is an optimization opportunity here. + * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful + * since it recalculates estimate from scratch. + * For example, it would recount literal distribution and symbol codes every time. 
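+         * One possible improvement (not implemented here, and only a
+         * suggestion) would be to maintain running histograms that are
+         * updated incrementally as sequences are appended, so each estimate
+         * costs O(1) instead of a full recount.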
+ */ + cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, + &nextCBlock->entropy, entropyMetadata, + workspace, wkspSize, writeLitEntropy, writeSeqEntropy); + if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { + int litEntropyWritten = 0; + int seqEntropyWritten = 0; + const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); + const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, oend-op, + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + lastBlock && lastSequence); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Committed the sub-block"); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + sp += seqCount; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + litSize = 0; + seqCount = 0; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + } + } + } while (!lastSequence); + if (writeLitEntropy) { + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. + */ + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); + return 0; + } + if (ip < iend) { + size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); + DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { + seqDef const* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { + ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } + DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); + return op-ostart; +} + +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock) { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, + zc->blockState.nextCBlock, + &entropyMetadata, + &zc->appliedParams, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.h new file mode 100644 index 
000000000..176f9b106 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_compress_superblock.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPRESS_ADVANCED_H +#define ZSTD_COMPRESS_ADVANCED_H + +/*-************************************* +* Dependencies +***************************************/ + +#include "../zstd.h" /* ZSTD_CCtx */ + +/*-************************************* +* Target Compressed Block Size +***************************************/ + +/* ZSTD_compressSuperBlock() : + * Used to compress a super block when targetCBlockSize is being used. + * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */ +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock); + +#endif /* ZSTD_COMPRESS_ADVANCED_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_cwksp.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_cwksp.h new file mode 100644 index 000000000..47afe3dc7 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_cwksp.h @@ -0,0 +1,676 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CWKSP_H +#define ZSTD_CWKSP_H + +/*-************************************* +* Dependencies +***************************************/ +#include "../common/zstd_internal.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Constants +***************************************/ + +/* Since the workspace is effectively its own little malloc implementation / + * arena, when we run under ASAN, we should similarly insert redzones between + * each internal element of the workspace, so ASAN will catch overruns that + * reach outside an object but that stay inside the workspace. + * + * This defines the size of that redzone. + */ +#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE +#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 +#endif + + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + +/*-************************************* +* Structures +***************************************/ +typedef enum { + ZSTD_cwksp_alloc_objects, + ZSTD_cwksp_alloc_buffers, + ZSTD_cwksp_alloc_aligned +} ZSTD_cwksp_alloc_phase_e; + +/** + * Used to describe whether the workspace is statically allocated (and will not + * necessarily ever be freed), or if it's dynamically allocated and we can + * expect a well-formed caller to free this. + */ +typedef enum { + ZSTD_cwksp_dynamic_alloc, + ZSTD_cwksp_static_alloc +} ZSTD_cwksp_static_alloc_e; + +/** + * Zstd fits all its internal datastructures into a single continuous buffer, + * so that it only needs to perform a single OS allocation (or so that a buffer + * can be provided to it and it can perform no allocations at all). This buffer + * is called the workspace. 
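+ * (A single buffer also makes the workspace cheap to hand off wholesale,
+ * e.g. via ZSTD_cwksp_move() further below.)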
+ *
+ * Several optimizations complicate that process of allocating memory ranges
+ * from this workspace for each internal datastructure:
+ *
+ * - These different internal datastructures have different setup requirements:
+ *
+ *   - The static objects need to be cleared once and can then be trivially
+ *     reused for each compression.
+ *
+ *   - Various buffers don't need to be initialized at all--they are always
+ *     written into before they're read.
+ *
+ *   - The matchstate tables have a unique requirement that they don't need
+ *     their memory to be totally cleared, but they do need the memory to have
+ *     some bound, i.e., a guarantee that all values in the memory they've been
+ *     allocated are less than some maximum value (which is the starting value
+ *     for the indices that they will then use for compression). When this
+ *     guarantee is provided to them, they can use the memory without any setup
+ *     work. When it isn't, they have to clear the area.
+ *
+ * - These buffers also have different alignment requirements.
+ *
+ * - We would like to reuse the objects in the workspace for multiple
+ *   compressions without having to perform any expensive reallocation or
+ *   reinitialization work.
+ *
+ * - We would like to be able to efficiently reuse the workspace across
+ *   multiple compressions **even when the compression parameters change** and
+ *   we need to resize some of the objects (where possible).
+ *
+ * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
+ * abstraction was created. It works as follows:
+ *
+ * Workspace Layout:
+ *
+ * [                        ... workspace ...                         ]
+ * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+ *
+ * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
+ *   so that literally everything fits in a single buffer. Note: if present,
+ *   this must be the first object in the workspace, since ZSTD_customFree{CCtx,
+ *   CDict}() rely on a pointer comparison to see whether one or two frees are
+ *   required.
+ *
+ * - Fixed size objects: these are fixed-size, fixed-count objects that are
+ *   nonetheless "dynamically" allocated in the workspace so that we can
+ *   control how they're initialized separately from the broader ZSTD_CCtx.
+ *   Examples:
+ *   - Entropy Workspace
+ *   - 2 x ZSTD_compressedBlockState_t
+ *   - CDict dictionary contents
+ *
+ * - Tables: these are any of several different datastructures (hash tables,
+ *   chain tables, binary trees) that all respect a common format: they are
+ *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ *   Their sizes depend on the cparams. These tables are 64-byte aligned.
+ *
+ * - Aligned: these buffers are used for various purposes that require 4 byte
+ *   alignment, but don't require any initialization before they're used. These
+ *   buffers are each aligned to 64 bytes.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ *   any alignment or initialization before they're used. This means they can
+ *   be moved around at no cost for a new compression.
+ *
+ * Allocating Memory:
+ *
+ * The various types of objects must be allocated in order, so they can be
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+ * 2. Buffers
+ * 3. Aligned/Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
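+ *
+ * For example, a conforming sequence looks like this (a sketch with
+ * hypothetical sizes; the reserve functions are defined below):
+ *
+ *     void* obj = ZSTD_cwksp_reserve_object(ws, objSize);   (1. objects)
+ *     BYTE* buf = ZSTD_cwksp_reserve_buffer(ws, bufSize);   (2. buffers)
+ *     void* tbl = ZSTD_cwksp_reserve_table(ws, tblSize);    (3. tables)
+ *
+ * Issuing these calls in the reverse order is expected to fail, because the
+ * allocation phase can only advance.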
+ */ +typedef struct { + void* workspace; + void* workspaceEnd; + + void* objectEnd; + void* tableEnd; + void* tableValidEnd; + void* allocStart; + + BYTE allocFailed; + int workspaceOversizedDuration; + ZSTD_cwksp_alloc_phase_e phase; + ZSTD_cwksp_static_alloc_e isStatic; +} ZSTD_cwksp; + +/*-************************************* +* Functions +***************************************/ + +MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); + +MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; + assert(ws->workspace <= ws->objectEnd); + assert(ws->objectEnd <= ws->tableEnd); + assert(ws->objectEnd <= ws->tableValidEnd); + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); +} + +/** + * Align must be a power of 2. + */ +MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + size_t const mask = align - 1; + assert((align & mask) == 0); + return (size + mask) & ~mask; +} + +/** + * Use this to determine how much space in the workspace we will consume to + * allocate this object. (Normally it should be exactly the size of the object, + * but under special conditions, like ASAN, where we pad each object, it might + * be larger.) + * + * Since tables aren't currently redzoned, you don't need to call through this + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). + */ +MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) + return 0; +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#else + return size; +#endif +} + +/** + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); +} + +/** + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). + */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes + * to align the beginning of tables section, as well as another n_2=[0, 63] bytes + * to align the beginning of the aligned section. + * + * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and + * aligneds being sized in multiples of 64 bytes. + */ + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; + return slackSpace; +} + + +/** + * Return the number of additional bytes required to align a pointer to the given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); + assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); + return bytes; +} + +/** + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). + * + * Returns a pointer to the beginning of that space. 
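+ * On failure (not enough remaining space), sets ws->allocFailed and
+ * returns NULL.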
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes)
+{
+    void* const alloc = (BYTE*)ws->allocStart - bytes;
+    void* const bottom = ws->tableEnd;
+    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(alloc >= bottom);
+    if (alloc < bottom) {
+        DEBUGLOG(4, "cwksp: alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    /* the area is reserved from the end of wksp.
+     * If it overlaps with tableValidEnd, it voids guarantees on values' range */
+    if (alloc < ws->tableValidEnd) {
+        ws->tableValidEnd = alloc;
+    }
+    ws->allocStart = alloc;
+    return alloc;
+}
+
+/**
+ * Moves the cwksp to the next phase, and does any necessary allocations.
+ * cwksp initialization must necessarily go through each phase in order.
+ * Returns 0 on success, or a zstd error code.
+ */
+MEM_STATIC size_t
+ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase)
+{
+    assert(phase >= ws->phase);
+    if (phase > ws->phase) {
+        /* Going from allocating objects to allocating buffers */
+        if (ws->phase < ZSTD_cwksp_alloc_buffers &&
+                phase >= ZSTD_cwksp_alloc_buffers) {
+            ws->tableValidEnd = ws->objectEnd;
+        }
+
+        /* Going from allocating buffers to allocating aligneds/tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned &&
+                phase >= ZSTD_cwksp_alloc_aligned) {
+            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
+                size_t const bytesToAlign =
+                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
+                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
+                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
+                                memory_allocation, "aligned phase - alignment initial allocation failed!");
+            }
+            {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+                void* const alloc = ws->objectEnd;
+                size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                void* const objectEnd = (BYTE*)alloc + bytesToAlign;
+                DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+                RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
+                                "table phase - alignment initial allocation failed!");
+                ws->objectEnd = objectEnd;
+                ws->tableEnd = objectEnd;  /* table area starts being empty */
+                if (ws->tableValidEnd < ws->tableEnd) {
+                    ws->tableValidEnd = ws->tableEnd;
+    }   }   }
+        ws->phase = phase;
+        ZSTD_cwksp_assert_internal_consistency(ws);
+    }
+    return 0;
+}
+
+/**
+ * Returns whether this object/buffer/etc was allocated in this workspace.
+ */
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
+{
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+}
+
+/**
+ * Internal function. Do not use directly.
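+ * Handles the ASAN redzone bookkeeping (over-reserve, offset, unpoison)
+ * around the raw buffer reservation when address sanitization is enabled.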
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase)
+{
+    void* alloc;
+    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
+        return NULL;
+    }
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either side. */
+    if (alloc) {
+        alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+        if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+            __asan_unpoison_memory_region(alloc, bytes);
+        }
+    }
+#endif
+
+    return alloc;
+}
+
+/**
+ * Reserves and returns unaligned memory.
+ */
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
+{
+    return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+}
+
+/**
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+{
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
+                                            ZSTD_cwksp_alloc_aligned);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return ptr;
+}
+
+/**
+ * Aligned on 64 bytes. These buffers have the special property that
+ * their values remain constrained, allowing us to re-use them without
+ * memset()-ing them.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
+{
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+    void* alloc;
+    void* end;
+    void* top;
+
+    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+        return NULL;
+    }
+    alloc = ws->tableEnd;
+    end = (BYTE *)alloc + bytes;
+    top = ws->allocStart;
+
+    DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    assert((bytes & (sizeof(U32)-1)) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(end <= top);
+    if (end > top) {
+        DEBUGLOG(4, "cwksp: table alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->tableEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0);
+    assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return alloc;
+}
+
+/**
+ * Aligned on sizeof(void*).
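+ * Reserved at the front of the workspace, in the object segment.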
+ * Note : should happen only once, at workspace first initialization
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+    void* alloc = ws->objectEnd;
+    void* end = (BYTE*)alloc + roundedBytes;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    DEBUGLOG(4,
+        "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
+        alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
+    assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0);
+    assert(bytes % ZSTD_ALIGNOF(void*) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    /* we must be in the first phase, no advance is possible */
+    if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
+        DEBUGLOG(3, "cwksp: object alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->objectEnd = end;
+    ws->tableEnd = end;
+    ws->tableValidEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either side. */
+    alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    return alloc;
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws)
+{
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the table re-use logic is sound, and that we don't
+     * access table space that we haven't cleaned, we re-"poison" the table
+     * space every time we mark it dirty. */
+    {
+        size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+        assert(__msan_test_shadow(ws->objectEnd, size) == -1);
+        __msan_poison(ws->objectEnd, size);
+    }
+#endif
+
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    ws->tableValidEnd = ws->objectEnd;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ws->tableValidEnd = ws->tableEnd;
+    }
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Zero the part of the allocated tables not already marked clean.
+ */
+MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
+    }
+    ZSTD_cwksp_mark_tables_clean(ws);
+}
+
+/**
+ * Invalidates table allocations.
+ * All other allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: clearing tables!");
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* We don't do this when the workspace is statically allocated, because
+     * when that is the case, we have no capability to hook into the end of the
+     * workspace's lifecycle to unpoison the memory.
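+     * (Static workspaces therefore simply skip the re-poisoning step.)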
+ */ + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * Invalidates all buffer, aligned, and table allocations. + * Object allocations remain valid. + */ +MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: clearing!"); + +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the context re-use logic is sound, and that we don't + * access stuff that this compression hasn't initialized, we re-"poison" + * the workspace (or at least the non-static, non-table parts of it) + * every time we start a new compression. */ + { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } +#endif + +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* We don't do this when the workspace is statically allocated, because + * when that is the case, we have no capability to hook into the end of the + * workspace's lifecycle to unpoison the memory. + */ + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ws->allocStart = ws->workspaceEnd; + ws->allocFailed = 0; + if (ws->phase > ZSTD_cwksp_alloc_buffers) { + ws->phase = ZSTD_cwksp_alloc_buffers; + } + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed + * buffer, if present, must be separately freed). + */ +MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_cwksp_static_alloc_e isStatic) { + DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size); + assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */ + ws->workspace = start; + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); + ws->workspaceOversizedDuration = 0; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) { + void* workspace = ZSTD_customMalloc(size, customMem); + DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size); + RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!"); + ZSTD_cwksp_init(ws, workspace, size, ZSTD_cwksp_dynamic_alloc); + return 0; +} + +MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { + void *ptr = ws->workspace; + DEBUGLOG(4, "cwksp: freeing workspace"); + ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp)); + ZSTD_customFree(ptr, customMem); +} + +/** + * Moves the management of a workspace from one cwksp to another. The src cwksp + * is left in an invalid state (src must be re-init()'ed before it's used again). 
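+ * No bytes are copied and nothing is freed: it is a pure ownership transfer,
+ * so only dst remains responsible for freeing the underlying buffer.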
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+    *dst = *src;
+    ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
+         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+    return ws->allocFailed;
+}
+
+/*-*************************************
+* Functions Checking Free Space
+***************************************/
+
+/* ZSTD_cwksp_estimated_space_within_bounds() :
+ * Returns whether the estimated space needed for a wksp is within an acceptable limit of the
+ * actual amount of space used.
+ */
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
+                                                        size_t const estimatedSpace, int resizedWorkspace) {
+    if (resizedWorkspace) {
+        /* Resized/newly allocated wksp should have exact bounds */
+        return ZSTD_cwksp_used(ws) == estimatedSpace;
+    } else {
+        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
+         * than estimatedSpace. See the alignment comments above for details.
+         */
+        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
+    }
+}
+
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_available(
+        ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+        && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+        ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+        ws->workspaceOversizedDuration++;
+    } else {
+        ws->workspaceOversizedDuration = 0;
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.c
new file mode 100644
index 000000000..c2dbd54c1
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */ + +#include "zstd_compress_internal.h" +#include "zstd_double_fast.h" + +static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) { + ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); + } + if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { + ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); + } + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. 
+ */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) + hashSmall[smHash] = curr + i; + if (i == 0 || hashLarge[lgHash] == 0) + hashLarge[lgHash] = curr + i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr = 1 << kSearchStrength; + /* the position at which to increment the step size if no match is found */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip += ((ip - prefixLowest) == 0); + { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step = 1; + nextStep = ip + kStepIncr; + ip1 = ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 = ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 = hashLong[hl0]; + matchl0 = base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 = hashSmall[hs0]; + curr = (U32)(ip-base); + matchs0 = base + idxs0; + + hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */ + + /* 
check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + + if (idxl0 > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchl0) == MEM_read64(ip)) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } + } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + + if (idxs0 > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(matchs0) == MEM_read32(ip)) { + goto _search_next_long; + } + } + + if (ip1 >= nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep += kStepIncr; + } + ip = ip1; + ip1 += step; + + hl0 = hl1; + idxl0 = idxl1; + matchl0 = matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <= ilimit); + +_cleanup: + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* check prefix long +1 match */ + if (idxl1 > prefixLowestIndex) { + if (MEM_read64(matchl1) == MEM_read64(ip1)) { + ip = ip1; + mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; + offset = (U32)(ip-matchl1); + while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ + goto _match_found; + } + } + + /* if no long +1 match, explore the short match we found */ + mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset = (U32)(ip - matchs0); + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 = offset_1; + offset_1 = offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when ip1 is + * greater than or equal to the new ip we will have after we're done + * processing this match. Rather than perform that test directly + * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler + * more predictable test. The minmatch even if we take a short match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. 
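+         * Skipping the write when step >= 4 only forgoes one match
+         * candidate; it never affects correctness.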
*/ + hashLong[hl1] = (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; + const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); + const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); + + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); + PREFETCH_AREA(dictHashLong, hashTableBytes) + 
PREFETCH_AREA(dictHashSmall, chainTableBytes) + } + + /* init */ + ip += (dictAndPrefixLength == 0); + + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; + U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); + int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; + const BYTE* matchLong = base + matchIndexL; + const BYTE* match = base + matchIndexS; + const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + if (matchIndexL > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchLong) == MEM_read64(ip)) { + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictTagsMatchL) { + /* check dictMatchState long match */ + U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + + if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { + mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; + offset = (U32)(curr - dictMatchIndexL - dictIndexDelta); + while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ + goto _match_found; + } } + + if (matchIndexS > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } else if (dictTagsMatchS) { + /* check dictMatchState short match */ + U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + + if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } } + + ip += ((ip-anchor) >> kSearchStrength) + 1; +#if defined(__aarch64__) + PREFETCH_L1(ip+256); +#endif + continue; + +_search_next_long: + { size_t const hl3 = 
ZSTD_hashPtr(ip+1, hBitsL, 8); + size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ + if (matchIndexL3 > prefixLowestIndex) { + if (MEM_read64(matchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictTagsMatchL3) { + /* check dict long +1 match */ + U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; + ip++; + offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta); + while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ + goto _match_found; + } } } + + /* if no long +1 match, explore the short match we found */ + if (matchIndexS < prefixLowestIndex) { + mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; + offset = (U32)(curr - matchIndexS); + while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip - match); + while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + +_match_found: + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } /* while (ip < ilimit) */ + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +#define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize); + } +} + + +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const U32 mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); + } +} + + +static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); + const U32 dictStartIndex = lowLimit; + const U32 dictLimit = ms->window.dictLimit; + const U32 prefixStartIndex = (dictLimit > lowLimit) ? 
dictLimit : lowLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; + U32 offset_1=rep[0], offset_2=rep[1]; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize); + + /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 matchIndex = hashSmall[hSmall]; + const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; + const BYTE* match = matchBase + matchIndex; + + const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); + const U32 matchLongIndex = hashLong[hLong]; + const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base; + const BYTE* matchLong = matchLongBase + matchLongIndex; + + const U32 curr = (U32)(ip-base); + const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */ + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ + + if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart; + U32 offset; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; + offset = curr - matchLongIndex; + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; + const BYTE* match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = curr + 1; + if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? 
dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; + ip++; + offset = curr+1 - matchIndex3; + while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ + } else { + const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + offset = curr - matchIndex; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } } + + /* move to next sequence start */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) + +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.h new file mode 100644 index 000000000..6d8ee8c65 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_double_fast.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_DOUBLE_FAST_H +#define ZSTD_DOUBLE_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "../common/mem.h" /* U32 */ +#include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_DOUBLE_FAST_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.c new file mode 100644 index 000000000..291173449 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.c @@ -0,0 +1,960 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ +#include "zstd_fast.h" + +static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. + * Feel free to remove this assert if there's a good reason! */ + assert(dtlm == ZSTD_dtlm_full); + + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } + + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); + } } } } +} + +static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. + * Feel free to remove this assert if there's a good reason! */ + assert(dtlm == ZSTD_dtlm_fast); + + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls); + hashTable[hash0] = curr; + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hash] == 0) { /* not yet filled */ + hashTable[hash] = curr + p; + } } } } +} + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillHashTableForCCtx(ms, end, dtlm); + } +} + + +/** + * If you squint hard enough (and ignore repcodes), the search operation at any + * given position is broken into 4 stages: + * + * 1. Hash (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load (map index to value at that position via input read) + * 4. 
Compare + * + * Each of these steps involves a memory read at an address which is computed + * from the previous step. This means these steps must be sequenced and their + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before moving + * onto the next, this implementation interleaves these operations across the + * next few positions: + * + * R = Repcode Read & Compare + * H = Hash + * T = Table Lookup + * M = Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N | ... M + * N+1 | ... TM + * N+2 | R H T M + * N+3 | H TM + * N+4 | R H T M + * N+5 | H ... + * N+6 | R ... + * + * This is very much analogous to the pipelining of execution in a CPU. And just + * like a CPU, we have to dump the pipeline when we find a match (i.e., take a + * branch). + * + * When this happens, we throw away our current state, and do the following prep + * to re-enter the loop: + * + * Pos | Time --> + * ----+------------------- + * N | H T + * N+1 | H + * + * This is also the work we do at the beginning to enter the loop initially. + */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + + const BYTE* anchor = istart; + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + U32 mval; /* src value at match idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + + /* ip0 and ip1 are always adjacent. The targetLength skipping and + * uncompressibility acceleration is applied to every other position, + * matching the behavior of #1562. step therefore represents the gap + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. 
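+ * For example, with step == 2 the four live positions are ip0, ip0+1,
+ * ip0+2, and ip0+3; once step grows to 3 (it increments roughly every
+ * kStepIncr bytes that pass without a match), the pairs become
+ * ip0/ip0+1 and ip0+3/ip0+4, so poorly compressible regions are
+ * traversed progressively faster.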
*/ + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; + if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; + if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ + ip1 = ip0 + 1; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + idx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ + const U32 rval = MEM_read32(ip2 - rep_offset1); + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) { + ip0 = ip2; + match0 = ip0 - rep_offset1; + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = REPCODE1_TO_OFFBASE; + mLength += 4; + + /* First write next hash table entry; we've already calculated it. + * This write is known to be safe because the ip1 is before the + * repcode (ip2). */ + hashTable[hash1] = (U32)(ip1 - base); + + goto _match; + } + + /* load match for ip[0] */ + if (idx >= prefixStartIndex) { + mval = MEM_read32(base + idx); + } else { + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ + } + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + + /* First write next hash table entry; we've already calculated it. + * This write is known to be safe because the ip1 == ip0 + 1, so + * we know we will resume searching after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); + + goto _offset; + } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* load match for ip[0] */ + if (idx >= prefixStartIndex) { + mval = MEM_read32(base + idx); + } else { + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ + } + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + + /* first write next hash table entry; we've already calculated it */ + if (step <= 4) { + /* We need to avoid writing an index into the hash table >= the + * position at which we will pick up our searching after we've + * taken this match. + * + * The minimum possible match has length 4, so the earliest ip0 + * can be after we take this match will be the current ip0 + 4. + * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely + * write this position. 
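+ * Hence the step <= 4 guard above: it ensures ip1 <= ip0 + 3, which
+ * is strictly before the earliest position (ip0 + 4) at which the
+ * search can resume, so this write can never plant an index at or
+ * beyond the resumption point.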
+ */ + hashTable[hash1] = (U32)(ip1 - base); + } + + goto _offset; + } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + + /* When the repcodes are outside of the prefix, we set them to zero before the loop. + * When the offsets are still zero, we need to restore them after the block to have a correct + * repcode history. If only one offset was invalid, it is easy. The tricky case is when both + * offsets were invalid. We need to figure out which offset to refill with. + * - If both offsets are zero they are in the same order. + * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. + * - If only one is zero, we need to decide which offset to restore. + * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. + * - It is impossible for rep_offset2 to be non-zero. + * + * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then + * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. + */ + offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; + rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); + offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } + +_match: /* Requires: ip0, match0, offcode */ + + /* Count the forward length. */ + mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; + + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */ + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4; + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) 
*/ + } } } + + goto _start; +} + +#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4, 1) +ZSTD_GEN_FAST_FN(noDict, 5, 1) +ZSTD_GEN_FAST_FN(noDict, 6, 1) +ZSTD_GEN_FAST_FN(noDict, 7, 1) + +ZSTD_GEN_FAST_FN(noDict, 4, 0) +ZSTD_GEN_FAST_FN(noDict, 5, 0) +ZSTD_GEN_FAST_FN(noDict, 6, 0) +ZSTD_GEN_FAST_FN(noDict, 7, 0) + +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState == NULL); + if (ms->cParams.targetLength > 1) { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } + + } +} + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip0 = istart; + const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); + const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); + const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
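+ * (Concretely: the assert below checks that endIndex - prefixStartIndex
+ * does not exceed maxDistance == 1U << cParams->windowLog.)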
*/ + const U32 maxDistance = 1U << cParams->windowLog; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + + /* ensure there will be no underflow + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + PREFETCH_AREA(dictHashTable, hashTableBytes) + } + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); + ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + + /* Outer search loop */ + assert(stepSize >= 1); + while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; + size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); + + size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); + U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); + + U32 matchIndex = hashTable[hash0]; + U32 curr = (U32)(ip0 - base); + size_t step = stepSize; + const size_t kStepIncr = 1 << kSearchStrength; + const BYTE* nextStep = ip0 + kStepIncr; + + /* Inner search loop */ + while (1) { + const BYTE* match = base + matchIndex; + const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixStartIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); + hashTable[hash0] = curr; /* update hash table */ + + if (((U32) ((prefixStartIndex - 1) - repIndex) >= + 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + break; + } + + if (dictTagsMatch) { + /* Found a possible dict match */ + const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { + /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ + if (matchIndex <= prefixStartIndex) { + U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); + mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; + while (((ip0 > anchor) & (dictMatch > dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; + dictMatch--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + } + } + + if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { + /* found a regular match */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) + && (ip0[-1] == match[-1])) { + ip0--; + match--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + + /* Prepare for next iteration */ + dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; + dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); + matchIndex = hashTable[hash1]; + + if (ip1 >= nextStep) { + step++; + nextStep += kStepIncr; + } + ip0 = ip1; + ip1 = ip1 + step; + if (ip1 > ilimit) goto _cleanup; + + curr = (U32)(ip0 - base); + hash0 = hash1; + } /* end inner search loop */ + + /* match found */ + assert(mLength); + ip0 += mLength; + anchor = ip0; + + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ + while (ip0 <= ilimit) { + U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; + continue; + } + break; + } + } + + /* Prepare for next iteration */ + assert(ip0 == anchor); + ip1 = ip0 + stepSize; + } + +_cleanup: + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState != NULL); + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize); + } +} + + +static size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); + const U32 dictStartIndex = lowLimit; + const BYTE* const dictStart = dictBase + dictStartIndex; + const U32 dictLimit = ms->window.dictLimit; + const U32 prefixStartIndex = dictLimit < lowLimit ? 
lowLimit : dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const dictEnd = dictBase + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + const BYTE* idxBase; /* base pointer for idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ + + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1); + + /* switch to "regular" variant if extDict is invalidated due to maxDistance */ + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + + { U32 const curr = (U32)(ip0 - base); + U32 const maxRep = curr - dictStartIndex; + if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ + ip1 = ip0 + 1; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + idx = hashTable[hash0]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + do { + { /* load repcode match for ip[2] */ + U32 const current2 = (U32)(ip2 - base); + U32 const repIndex = current2 - offset_1; + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + U32 rval; + if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ + & (offset_1 > 0) ) { + rval = MEM_read32(repBase + repIndex); + } else { + rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if (MEM_read32(ip2) == rval) { + ip0 = ip2; + match0 = repBase + repIndex; + matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + assert((match0 != prefixStart) & (match0 != dictStart)); + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = REPCODE1_TO_OFFBASE; + mLength += 4; + goto _match; + } } + + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? + MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? 
+ MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx, idxBase */ + + /* Compute the offset code. */ + { U32 const offset = current0 - idx; + const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; + matchEnd = idx < prefixStartIndex ? dictEnd : iend; + match0 = idxBase + idx; + offset_2 = offset_1; + offset_1 = offset; + offcode = OFFSET_TO_OFFBASE(offset); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } } + +_match: /* Requires: ip0, match0, offcode, matchEnd */ + + /* Count the forward length. */ + assert(matchEnd != 0); + mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] = (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + while (ip0 <= ilimit) { + U32 const repIndex2 = (U32)(ip0-base) - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += repLength2; + anchor = ip0; + continue; + } + break; + } } + + goto _start; +} + +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0) + +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize); + } +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.h new file mode 100644 index 000000000..3bfeb2c5f --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_fast.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_FAST_H +#define ZSTD_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "../common/mem.h" /* U32 */ +#include "zstd_compress_internal.h" + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.c new file mode 100644 index 000000000..e54b43c0c --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.c @@ -0,0 +1,2120 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include "zstd_compress_internal.h" +#include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + + +/*-************************************* +* Binary Tree search +***************************************/ + +static void +ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + if (idx != target) + DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)", + idx, target, ms->window.dictLimit); + assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ + (void)iend; + + assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ + for ( ; idx < target ; idx++) { + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ + U32 const matchIndex = hashTable[h]; + + U32* const nextCandidatePtr = bt + 2*(idx&btMask); + U32* const sortMarkPtr = nextCandidatePtr + 1; + + DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx); + hashTable[h] = idx; /* Update Hash Table */ + *nextCandidatePtr = matchIndex; /* update BT like a chain */ + *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; + } + ms->nextToUpdate = target; +} + + +/** ZSTD_insertDUBT1() : + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +static void +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr; + const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = smallerPtr + 1; + U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ + U32 dummy32; /* to be nullified at the end */ + U32 const windowValid = ms->window.lowLimit; + U32 const maxDistance = 1U << cParams->windowLog; + U32 const windowLow = (curr - windowValid > maxDistance) ? 
curr - maxDistance : windowValid; + + + DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", + curr, dictLimit, windowLow); + assert(curr >= btLow); + assert(ip < iend); /* condition for ZSTD_count */ + + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < curr); + /* note : all candidates are now supposed sorted, + * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK + * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ + + if ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ + || (curr < dictLimit) /* both in extDict */) { + const BYTE* const mBase = ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit)) ? + base : dictBase; + assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ + || (curr < dictLimit) ); + match = mBase + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* preparation for next read of match[matchLength] */ + } + + DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", + curr, matchIndex, (U32)matchLength); + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u", + matchIndex, btLow, nextPtr[1]); + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u", + matchIndex, btLow, nextPtr[0]); + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; +} + + +static size_t +ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, + U32 nbCompares, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_matchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 dictMatchIndex = dictHashTable[h]; + + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + U32 const curr = 
(U32)(ip-base); + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictEnd = dms->window.nextSrc; + U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); + U32 const dictLowLimit = dms->window.lowLimit; + U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; + + U32* const dictBt = dms->chainTable; + U32 const btLog = dmsCParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask; + + size_t commonLengthSmaller=0, commonLengthLarger=0; + + (void)dictMode; + assert(dictMode == ZSTD_dictMatchState); + + for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) { + U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dictBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (dictMatchIndex+matchLength >= dictHighLimit) + match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + + if (bestLength >= MINMATCH) { + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + +} + + +static size_t +ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + + const BYTE* const base = ms->window.base; + U32 const curr = (U32)(ip-base); + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 
const btLow = (btMask >= curr) ? 0 : curr - btMask; + U32 const unsortLimit = MAX(btLow, windowLow); + + U32* nextCandidate = bt + 2*(matchIndex&btMask); + U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; + U32 nbCompares = 1U << cParams->searchLog; + U32 nbCandidates = nbCompares; + U32 previousCandidate = 0; + + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr); + assert(ip <= iend-8); /* required for h calculation */ + assert(dictMode != ZSTD_dedicatedDictSearch); + + /* reach end of unsorted candidates list */ + while ( (matchIndex > unsortLimit) + && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) + && (nbCandidates > 1) ) { + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted", + matchIndex); + *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */ + previousCandidate = matchIndex; + matchIndex = *nextCandidate; + nextCandidate = bt + 2*(matchIndex&btMask); + unsortedMark = bt + 2*(matchIndex&btMask) + 1; + nbCandidates --; + } + + /* nullify last candidate if it's still unsorted + * simplification, detrimental to compression ratio, beneficial for speed */ + if ( (matchIndex > unsortLimit) + && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u", + matchIndex); + *nextCandidate = *unsortedMark = 0; + } + + /* batch sort stacked candidates */ + matchIndex = previousCandidate; + while (matchIndex) { /* will end on matchIndex == 0 */ + U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; + U32 const nextCandidateIdx = *nextCandidateIdxPtr; + ZSTD_insertDUBT1(ms, matchIndex, iend, + nbCandidates, unsortLimit, dictMode); + matchIndex = nextCandidateIdx; + nbCandidates++; + } + + /* find longest match */ + { size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr + 8 + 1; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + matchIndex = hashTable[h]; + hashTable[h] = curr; /* Update Hash Table */ + + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + + if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) + bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any + * further in 
this loop, make sure we + * skip checking in the dictionary. */ + } + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, + offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +} + + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); +} + +/*********************************** +* Dedicated dict search +***********************************/ + +void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32* const hashTable = ms->hashTable; + U32* const chainTable = ms->chainTable; + U32 const chainSize = 1 << ms->cParams.chainLog; + U32 idx = ms->nextToUpdate; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; + U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const cacheSize = bucketSize - 1; + U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; + U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts; + + /* We know the hashtable is oversized by a factor of `bucketSize`. + * We are going to temporarily pretend `bucketSize == 1`, keeping only a + * single entry. We will use the rest of the space to construct a temporary + * chaintable. + */ + U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + U32* const tmpHashTable = hashTable; + U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; + U32 const tmpMinChain = tmpChainSize < target ? 
target - tmpChainSize : idx; + U32 hashIdx; + + assert(ms->cParams.chainLog <= 24); + assert(ms->cParams.hashLog > ms->cParams.chainLog); + assert(idx != 0); + assert(tmpMinChain <= minChain); + + /* fill conventional hash table and conventional chain table */ + for ( ; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch); + if (idx >= tmpMinChain) { + tmpChainTable[idx - tmpMinChain] = hashTable[h]; + } + tmpHashTable[h] = idx; + } + + /* sort chains into ddss chain table */ + { + U32 chainPos = 0; + for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) { + U32 count; + U32 countBeyondMinChain = 0; + U32 i = tmpHashTable[hashIdx]; + for (count = 0; i >= tmpMinChain && count < cacheSize; count++) { + /* skip through the chain to the first position that won't be + * in the hash cache bucket */ + if (i < minChain) { + countBeyondMinChain++; + } + i = tmpChainTable[i - tmpMinChain]; + } + if (count == cacheSize) { + for (count = 0; count < chainLimit;) { + if (i < minChain) { + if (!i || ++countBeyondMinChain > cacheSize) { + /* only allow pulling `cacheSize` number of entries + * into the cache or chainTable beyond `minChain`, + * to replace the entries pulled out of the + * chainTable into the cache. This lets us reach + * back further without increasing the total number + * of entries in the chainTable, guaranteeing the + * DDSS chain table will fit into the space + * allocated for the regular one. */ + break; + } + } + chainTable[chainPos++] = i; + count++; + if (i < tmpMinChain) { + break; + } + i = tmpChainTable[i - tmpMinChain]; + } + } else { + count = 0; + } + if (count) { + tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count; + } else { + tmpHashTable[hashIdx] = 0; + } + } + assert(chainPos <= chainSize); /* I believe this is guaranteed... */ + } + + /* move chain pointers into the last entry of each hash bucket */ + for (hashIdx = (1 << hashLog); hashIdx; ) { + U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const chainPackedPointer = tmpHashTable[hashIdx]; + U32 i; + for (i = 0; i < cacheSize; i++) { + hashTable[bucketIdx + i] = 0; + } + hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer; + } + + /* fill the buckets of the hash table */ + for (idx = ms->nextToUpdate; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch) + << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 i; + /* Shift hash cache down 1. */ + for (i = cacheSize - 1; i; i--) + hashTable[h + i] = hashTable[h + i - 1]; + hashTable[h] = idx; + } + + ms->nextToUpdate = target; +} + +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, + const ZSTD_matchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { + const U32 ddsLowestIndex = dms->window.dictLimit; + const BYTE* const ddsBase = dms->window.base; + const BYTE* const ddsEnd = dms->window.nextSrc; + const U32 ddsSize = (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta = dictLimit - ddsSize; + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? 
nbAttempts : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 const chainIndex = chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; + match = ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 chainIndex = chainPackedPointer >> 8; + U32 const chainLength = chainPackedPointer & 0xFF; + U32 const chainAttempts = nbAttempts - ddsAttempt; + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; + U32 chainAttempt; + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); + } + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->chainTable[chainIndex]; + match = ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +} + +/* inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const chainTable = ms->chainTable; + const U32 chainSize = (1 << cParams->chainLog); + const U32 chainMask = chainSize-1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 minChain = curr > chainSize ? curr - chainSize : 0; + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch + ? 
ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + + U32 matchIndex; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32* entry = &dms->hashTable[ddsIdx]; + PREFETCH_L1(entry); + } + + /* HC4 match finder */ + matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= minChain) break; + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + const U32* const dmsChainTable = dms->chainTable; + const U32 dmsChainSize = (1 << dms->cParams.chainLog); + const U32 dmsChainMask = dmsChainSize - 1; + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + const U32 dmsMinChain = dmsSize > dmsChainSize ? 
dmsSize - dmsChainSize : 0; + + matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; + + for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= dmsMinChain) break; + + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; + } + } + + return ml; +} + +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) + +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { + U32 const next = (*tagRow - 1) & rowMask; + *tagRow = (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) == 0); + return (((size_t)ptr) & (align - 1)) == 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) +{ + U32 const* const hashTable = ms->hashTable; + U16 const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { + U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; + } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); +} + +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, + U16 const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, + U32 const rowLog, U32 const mls) +{ + U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; + return hash; + } +} + +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32* const hashTable = ms->hashTable; + U16* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash = useCache ? 
ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
+                                  : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+                                                       Explicit cast allows us to get exact desired position within each row */
+        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+        assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        row[pos] = updateStartIdx;
+    }
+}
+
+/* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
+ * Skips sections of long matches as is necessary.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+                                                    U32 const mls, U32 const rowLog,
+                                                    U32 const rowMask, U32 const useCache)
+{
+    U32 idx = ms->nextToUpdate;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    const U32 kSkipThreshold = 384;
+    const U32 kMaxMatchStartPositionsToUpdate = 96;
+    const U32 kMaxMatchEndPositionsToUpdate = 32;
+
+    if (useCache) {
+        /* Only skip positions when using hash cache, i.e.
+         * if we are loading a dict, don't skip anything.
+         * If we decide to skip, then we only update a set number
+         * of positions at the beginning and end of the match.
+         */
+        if (UNLIKELY(target - idx > kSkipThreshold)) {
+            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
+            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
+            idx = target - kMaxMatchEndPositionsToUpdate;
+            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
+        }
+    }
+    assert(target >= idx);
+    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
+    ms->nextToUpdate = target;
+}
+
+/* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
+    const U32 rowMask = (1u << rowLog) - 1;
+    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the number of mask bits that correspond to a single row entry
+ * (the group width). Not all architectures have an easy movemask instruction,
+ * so working in whole groups of bits makes the match mask easier and faster
+ * to iterate over.
+ */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + (void)rowEntries; +#if defined(ZSTD_ARCH_ARM_NEON) + /* NEON path only works for little endian */ + if (!MEM_isLittleEndian()) { + return 1; + } + if (rowEntries == 16) { + return 4; + } + if (rowEntries == 32) { + return 2; + } + if (rowEntries == 64) { + return 1; + } +#endif + return 1; +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) +{ + const __m128i comparisonMask = _mm_set1_epi8((char)tag); + int matches[4] = {0}; + int i; + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); + for (i=0; i> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) { + return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { + return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } +#endif +} + +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" + * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines + * which row to insert into. + * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can + * be considered as a circular buffer with a "head" index that resides in the tagTable. + * - Also insert the "tag" into the equivalent row and position in the tagTable. + * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. + * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, + * for alignment/performance reasons, leaving some bytes unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. 
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_RowFindBestMatch(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls, const ZSTD_dictMode_e dictMode,
+                        const U32 rowLog)
+{
+    U32* const hashTable = ms->hashTable;
+    U16* const tagTable = ms->tagTable;
+    U32* const hashCache = ms->hashCache;
+    const U32 hashLog = ms->rowHashLog;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 curr = (U32)(ip-base);
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 lowestValid = ms->window.lowLimit;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+    const U32 isDictionary = (ms->loadedDictEnd != 0);
+    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+    const U32 rowEntries = (1U << rowLog);
+    const U32 rowMask = rowEntries - 1;
+    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+    U32 nbAttempts = 1U << cappedSearchLog;
+    size_t ml=4-1;
+
+    /* DMS/DDS variables that may be referenced later */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+
+    /* Initialize the following variables to satisfy static analyzer */
+    size_t ddsIdx = 0;
+    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+    U32 dmsTag = 0;
+    U32* dmsRow = NULL;
+    BYTE* dmsTagRow = NULL;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+        {   /* Prefetch DDS hashtable entry */
+            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+            PREFETCH_L1(&dms->hashTable[ddsIdx]);
+        }
+        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+    }
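+
+    /* Worked example (illustrative numbers only): with searchLog == 7 and
+     * rowLog == 5, the row search below is capped at 1U << 5 == 32 attempts
+     * per position, so ddsExtraAttempts = 1U << (7 - 5) == 4 carries the
+     * unspent search budget over to the dedicated dictionary search. */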
+
+    if (dictMode == ZSTD_dictMatchState) {
+        /* Prefetch DMS rows */
+        U32* const dmsHashTable = dms->hashTable;
+        U16* const dmsTagTable = dms->tagTable;
+        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+        dmsRow = dmsHashTable + dmsRelRow;
+        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+    }
+
+    /* Update the hashTable and tagTable up to (but not including) ip */
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    {   /* Get the hash for ip, compute the appropriate row */
+        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
+        U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+        size_t numMatches = 0;
+        size_t currMatch = 0;
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
+
+        /* Cycle through the matches and prefetch */
+        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+            U32 const matchIndex = row[matchPos];
+            assert(numMatches < rowEntries);
+            if (matchIndex < lowLimit)
+                break;
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                PREFETCH_L1(base + matchIndex);
+            } else {
+                PREFETCH_L1(dictBase + matchIndex);
+            }
+            matchBuffer[numMatches++] = matchIndex;
+        }
+
+        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+           in ZSTD_row_update_internal() at the next search. */
+        {
+            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            row[pos] = ms->nextToUpdate++;
+        }
+
+        /* Return the longest match */
+        for (; currMatch < numMatches; ++currMatch) {
+            U32 const matchIndex = matchBuffer[currMatch];
+            size_t currentMl=0;
+            assert(matchIndex < curr);
+            assert(matchIndex >= lowLimit);
+
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                const BYTE* const match = base + matchIndex;
+                assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+                if (match[ml] == ip[ml]) /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex;
+                assert(match+4 <= dictEnd);
+                if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+            }
+
+            /* Save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+
+    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed.
*/ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + + { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex >= dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } + } + } + return ml; +} + + +typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + +/** + * This struct contains the functions necessary for lazy to search. + * Currently, that is only searchMax. However, it is still valuable to have the + * VTable because this makes it easier to add more functions to the VTable later. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_matchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. 
+ */ +typedef struct { + searchMax_f searchMax; +} ZSTD_LazyVTable; + +#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \ + static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode);\ + } \ + static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \ + ZSTD_BtFindBestMatch_##dictMode##_##mls \ + }; + +#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \ + static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \ + ZSTD_HcFindBestMatch_##dictMode##_##mls \ + }; + +#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \ + static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ + } \ + static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \ + ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \ + }; + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) 
\
+    X(__VA_ARGS__, noDict)                 \
+    X(__VA_ARGS__, extDict)                \
+    X(__VA_ARGS__, dictMatchState)         \
+    X(__VA_ARGS__, dedicatedDictSearch)
+
+/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
+/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
+/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
+
+#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode)  \
+    {                                       \
+        &ZSTD_BtVTable_##dictMode##_4,      \
+        &ZSTD_BtVTable_##dictMode##_5,      \
+        &ZSTD_BtVTable_##dictMode##_6       \
+    }
+
+#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode)  \
+    {                                       \
+        &ZSTD_HcVTable_##dictMode##_4,      \
+        &ZSTD_HcVTable_##dictMode##_5,      \
+        &ZSTD_HcVTable_##dictMode##_6       \
+    }
+
+#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls)   \
+    {                                               \
+        &ZSTD_RowVTable_##dictMode##_##mls##_4,     \
+        &ZSTD_RowVTable_##dictMode##_##mls##_5,     \
+        &ZSTD_RowVTable_##dictMode##_##mls##_6      \
+    }
+
+#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode)         \
+    {                                               \
+        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4),    \
+        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5),    \
+        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6)     \
+    }
+
+#define GEN_ZSTD_VTABLE_ARRAY(X)    \
+    {                               \
+        X(noDict),                  \
+        X(extDict),                 \
+        X(dictMatchState),          \
+        X(dedicatedDictSearch)      \
+    }
+
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+
+/**
+ * These tables are indexed first by the four ZSTD_dictMode_e values and then
+ * by the clamped mls value (and, for the row-based tables, the clamped rowLog
+ * value). Every reachable (dictMode, mls, rowLog) slot is populated by the
+ * generators above (extDict is handled by its own parser further below, which
+ * also selects from these tables); the selector only returns NULL for an
+ * unrecognized searchMethod.
+ */
+
+static ZSTD_LazyVTable const*
+ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
+{
+    /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
+    ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
+    ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
+    /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
+    ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
+
+    U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
+    U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
+    switch (searchMethod) {
+        case search_hashChain:
+            return hcVTables[dictMode][mls - 4];
+        case search_binaryTree:
+            return btVTables[dictMode][mls - 4];
+        case search_rowHash:
+            return rowVTables[dictMode][mls - 4][rowLog - 4];
+        default:
+            return NULL;
+    }
+}
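+
+/* Usage sketch (illustrative only; this mirrors what the block compressors
+ * below actually do): resolve the VTable once per block, then search through
+ * the function pointer.
+ *
+ *     searchMax_f const searchMax =
+ *         ZSTD_selectLazyVTable(ms, search_rowHash, ZSTD_noDict)->searchMax;
+ *     size_t offBase = 999999999;
+ *     size_t const ml = searchMax(ms, ip, iend, &offBase);
+ *
+ * mls and rowLog are clamped to [4,6] above, so every reachable
+ * (dictMode, mls, rowLog) slot holds a valid VTable pointer. */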
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        const void* src, size_t srcSize,
+                        const searchMethod_e searchMethod, const U32 depth,
+                        ZSTD_dictMode_e const dictMode)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+
+    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const int isDMS = dictMode == ZSTD_dictMatchState;
+    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+    const int isDxS = isDMS || isDDS;
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 dictLowestIndex    = isDxS ? dms->window.dictLimit : 0;
+    const BYTE* const dictBase   = isDxS ? dms->window.base : NULL;
+    const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd    = isDxS ? dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta     = isDxS ?
+                                   prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                   0;
+    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+    assert(searchMax != NULL);
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
+    ip += (dictAndPrefixLength == 0);
+    if (dictMode == ZSTD_noDict) {
+        U32 const curr = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
+        U32 const maxRep = curr - windowLow;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+    if (isDxS) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    if (searchMethod == search_rowHash) {
+        const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                            ms->nextToUpdate, ilimit);
+    }
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offBase = REPCODE1_TO_OFFBASE;
+        const BYTE* start=ip+1;
+        DEBUGLOG(7, "search baseline (depth 0)");
+
+        /* check repCode */
+        if (isDxS) {
+            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
+                                && repIndex < prefixLowestIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                if (depth==0) goto _storeSequence;
+            }
+        }
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offbaseFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offbaseFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offBase = offbaseFound;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            DEBUGLOG(7, "search depth 1");
+            ip ++;
+            if ( (dictMode == ZSTD_noDict)
+              && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+                if ((mlRep >= 4) && (gain2 > gain1))
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+            }
+            if (isDxS) {
+                const U32 repIndex = (U32)(ip - base) - offset_1;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                    int const gain2 = (int)(mlRep * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }
+            }
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                DEBUGLOG(7, "search depth 2");
+                ip ++;
+                if ( (dictMode == ZSTD_noDict)
+                  && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(mlRep * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }
+                if (isDxS) {
+                    const U32 repIndex = (U32)(ip - base) - offset_1;
+                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                        int const gain2 = (int)(mlRep * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                        if ((mlRep >= 4) && (gain2 > gain1))
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                    }
+                }
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
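+
+        /* Gain heuristic, worked through (illustrative numbers): a candidate
+         * only replaces the incumbent when its extra length outweighs the
+         * extra offset cost. At depth 2, an incumbent of matchLength 8 whose
+         * offBase costs ZSTD_highbit32() == 10 bits scores
+         * gain1 = 8*4 - 10 + 7 = 29, while a candidate of length 9 with a
+         * 20-bit offset scores gain2 = 9*4 - 20 = 16, so the shorter but
+         * cheaper incumbent is kept. */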
+
+        /* NOTE:
+         * Pay attention that `start[-value]` can lead to strange undefined behavior
+         * notably if `value` is unsigned, resulting in a large positive `-value`.
+         */
+        /* catch up */
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            if (dictMode == ZSTD_noDict) {
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
+                    { start--; matchLength++; }
+            }
+            if (isDxS) {
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            }
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
+        }
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = (size_t)(start - anchor);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        if (isDxS) {
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex = current2 - offset_2;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                        dictBase - dictIndexDelta + repIndex :
+                        base + repIndex;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
+                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+                    ip += matchLength;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        if (dictMode == ZSTD_noDict) {
+            while ( ((ip <= ilimit) & (offset_2>0))
+                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+                /* store sequence */
+                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
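+
+/* The wrappers below pin the template parameters: depth 0 maps to greedy,
+ * depth 1 to lazy, and depth 2 to lazy2 (btlazy2 is depth 2 over the binary
+ * tree). For instance (illustrative reading of the code below):
+ *
+ *     ZSTD_compressBlock_greedy   -> search_hashChain,  depth 0
+ *     ZSTD_compressBlock_lazy     -> search_hashChain,  depth 1
+ *     ZSTD_compressBlock_lazy2    -> search_hashChain,  depth 2
+ *     ZSTD_compressBlock_btlazy2  -> search_binaryTree, depth 2
+ */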
+
+
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+}
+
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+}
+
+/* Row-based matchfinder */
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+}
+
+size_t
ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +} + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; + const BYTE* const base = ms->window.base; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const dictStart = dictBase + ms->window.lowLimit; + const U32 windowLog = ms->cParams.windowLog; + const U32 rowLog = ms->cParams.searchLog < 5 ? 
4 : 5;
+
+    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
+
+    /* init */
+    ip += (ip == prefixStart);
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                               ms->nextToUpdate, ilimit);
+    }
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offBase = REPCODE1_TO_OFFBASE;
+        const BYTE* start=ip+1;
+        U32 curr = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
+            const U32 repIndex = (U32)(curr+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3)   /* intentional overflow */
+               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                if (depth==0) goto _storeSequence;
+        }   }
+
+        /* first search (depth 0) */
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            curr++;
+            /* check repCode */
+            {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                const U32 repIndex = (U32)(curr - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                    int const gain2 = (int)(repLength * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+                    if ((repLength >= 4) && (gain2 > gain1))
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                curr++;
+                /* check repCode */
+                {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                    const U32 repIndex = (U32)(curr - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                        int const gain2 = (int)(repLength * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+                        if ((repLength >= 4) && (gain2 > gain1))
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+                }   }
+
+                /* search match, depth 2 */
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
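+
+        /* The catch-up below walks `start` backwards while the byte before the
+         * match also matches, lengthening the match at no extra offset cost.
+         * mStart clamps the walk to whichever segment (extDict or prefix) the
+         * match lives in, so the walk never crosses the two-segment boundary. */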
+
+        /* catch up */
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }   /* catch up */
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = (size_t)(start - anchor);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        while (ip <= ilimit) {
+            const U32 repCurrent = (U32)(ip-base);
+            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+            const U32 repIndex = repCurrent - offset_2;
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+    }   }
+
+    /* Save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+}
+
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+}
+
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+}
diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.h
new file mode 100644
index 000000000..150f7b390
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_lazy.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+/**
+ * Dedicated Dictionary Search Structure bucket log. In the
+ * ZSTD_dedicatedDictSearch mode, the hashTable has
+ * 2 ** ZSTD_LAZY_DDSS_BUCKET_LOG entries in each bucket, rather than just
+ * one.
+ */
+#define ZSTD_LAZY_DDSS_BUCKET_LOG 2
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
+
+void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
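+
+/* Block compressors, one per (strategy, dictionary mode) pair. The base name
+ * selects the strategy (greedy/lazy/lazy2 = depth 0/1/2, btlazy2 = depth 2
+ * over the binary tree), the suffix selects the dictionary mode
+ * (_dictMatchState, _dedicatedDictSearch, _extDict), and a trailing _row
+ * picks the row-based matchfinder instead of hash chains. */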
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LAZY_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.c new file mode 100644 index 000000000..c14c62454 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.c @@ -0,0 +1,724 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_ldm.h" + +#include "../common/debug.h" +#include "../common/xxhash.h" +#include "zstd_fast.h" /* ZSTD_fillHashTable() */ +#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ +#include "zstd_ldm_geartab.h" + +#define LDM_BUCKET_SIZE_LOG 3 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_RLOG 7 + +typedef struct { + U64 rolling; + U64 stopMask; +} ldmRollingHashState_t; + +/** ZSTD_ldm_gear_init(): + * + * Initializes the rolling hash state such that it will honor the + * settings in params. */ +static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params) +{ + unsigned maxBitsInMask = MIN(params->minMatchLength, 64); + unsigned hashRateLog = params->hashRateLog; + + state->rolling = ~(U32)0; + + /* The choice of the splitting criterion is subject to two conditions: + * 1. it has to trigger on average every 2^(hashRateLog) bytes; + * 2. ideally, it has to depend on a window of minMatchLength bytes. + * + * In the gear hash algorithm, bit n depends on the last n bytes; + * so in order to obtain a good quality splitting criterion it is + * preferable to use bits with high weight. + * + * To match condition 1 we use a mask with hashRateLog bits set + * and, because of the previous remark, we make sure these bits + * have the highest possible weight while still respecting + * condition 2. + */ + if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) { + state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog); + } else { + /* In this degenerate case we simply honor the hash rate. */ + state->stopMask = ((U64)1 << hashRateLog) - 1; + } +} + +/** ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. 
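+ * (annotation: one table lookup and one shift-add per input byte; the main
+ * loop below is unrolled four ways)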
+ */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash = state->rolling; + size_t n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + +/** ZSTD_ldm_gear_feed(): + * + * Registers in the splits array all the split points found in the first + * size bytes following the data pointer. This function terminates when + * either all the data has been processed or LDM_BATCH_SIZE splits are + * present in the splits array. + * + * Precondition: The splits array must not be full. + * Returns: The number of bytes processed. */ +static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + BYTE const* data, size_t size, + size_t* splits, unsigned* numSplits) +{ + size_t n; + U64 hash, mask; + + hash = state->rolling; + mask = state->stopMask; + n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + if (UNLIKELY((hash & mask) == 0)) { \ + splits[*numSplits] = n; \ + *numSplits += 1; \ + if (*numSplits == LDM_BATCH_SIZE) \ + goto done; \ + } \ + } while (0) + + while (n + 3 < size) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < size) { + GEAR_ITER_ONCE(); + } + +#undef GEAR_ITER_ONCE + +done: + state->rolling = hash; + return n; +} + +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams) +{ + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); + if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; + if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (params->hashLog == 0) { + params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + } + if (params->hashRateLog == 0) { + params->hashRateLog = params->windowLog < params->hashLog + ? 0 + : params->windowLog - params->hashLog; + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); +} + +size_t ZSTD_ldm_getTableSize(ldmParams_t params) +{ + size_t const ldmHSize = ((size_t)1) << params.hashLog; + size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); + size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; +} + +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) +{ + return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0; +} + +/** ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
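+ *  e.g. with bucketSizeLog==3, hash h owns the eight entries
+ *  hashTable[(h<<3) .. (h<<3)+7], which ZSTD_ldm_insertEntry() fills
+ *  round-robin via bucketOffsets.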
*/ +static ldmEntry_t* ZSTD_ldm_getBucket( + ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) +{ + return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); +} + +/** ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ +static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, + ldmParams_t const ldmParams) +{ + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + + *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; + *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); + +} + +/** ZSTD_ldm_countBackwardsMatch() : + * Returns the number of bytes that match backwards before pIn and pMatch. + * + * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */ +static size_t ZSTD_ldm_countBackwardsMatch( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pMatchBase) +{ + size_t matchLength = 0; + while (pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) { + pIn--; + pMatch--; + matchLength++; + } + return matchLength; +} + +/** ZSTD_ldm_countBackwardsMatch_2segments() : + * Returns the number of bytes that match backwards from pMatch, + * even with the backwards match spanning 2 different segments. + * + * On reaching `pMatchBase`, start counting from mEnd */ +static size_t ZSTD_ldm_countBackwardsMatch_2segments( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pMatchBase, + const BYTE* pExtDictStart, const BYTE* pExtDictEnd) +{ + size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase); + if (pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) { + /* If backwards match is entirely in the extDict or prefix, immediately return */ + return matchLength; + } + DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength); + matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart); + DEBUGLOG(7, "final backwards match length = %zu", matchLength); + return matchLength; +} + +/** ZSTD_ldm_fillFastTables() : + * + * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. + * This is similar to ZSTD_loadDictionaryContent. + * + * The tables for the other strategies are filled within their + * block compressors. 
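+ * (annotation: only ZSTD_fast and ZSTD_dfast keep hash tables that can be
+ * bulk-filled here; the lazy and bt* strategies insert positions as their
+ * own searches advance, so there is nothing to do for them)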
*/ +static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + void const* end) +{ + const BYTE* const iend = (const BYTE*)end; + + switch(ms->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + default: + assert(0); /* not possible : not a valid strategy id */ + } + + return 0; +} + +void ZSTD_ldm_fillHashTable( + ldmState_t* ldmState, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params) +{ + U32 const minMatchLength = params->minMatchLength; + U32 const hBits = params->hashLog - params->bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; + size_t* const splits = ldmState->splitIndices; + unsigned numSplits; + + DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); + + ZSTD_ldm_gear_init(&hashState, params); + while (ip < iend) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + ldmEntry_t entry; + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); + ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); + } + } + + ip += hashed; + } +} + + +/** ZSTD_ldm_limitTableUpdate() : + * + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). */ +static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) +{ + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { + ms->nextToUpdate = + curr - MIN(512, curr - ms->nextToUpdate - 1024); + } +} + +static size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + /* LDM parameters */ + int const extDict = ZSTD_window_hasExtDict(ldmState->window); + U32 const minMatchLength = params->minMatchLength; + U32 const entsPerBucket = 1U << params->bucketSizeLog; + U32 const hBits = params->hashLog - params->bucketSizeLog; + /* Prefix and extDict parameters */ + U32 const dictLimit = ldmState->window.dictLimit; + U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; + BYTE const* const base = ldmState->window.base; + BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; + BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; + BYTE const* const dictEnd = extDict ? 
dictBase + dictLimit : NULL; + BYTE const* const lowPrefixPtr = base + dictLimit; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + BYTE const* const ilimit = iend - HASH_READ_SIZE; + /* Input positions */ + BYTE const* anchor = istart; + BYTE const* ip = istart; + /* Rolling hash state */ + ldmRollingHashState_t hashState; + /* Arrays for staged-processing */ + size_t* const splits = ldmState->splitIndices; + ldmMatchCandidate_t* const candidates = ldmState->matchCandidates; + unsigned numSplits; + + if (srcSize < minMatchLength) + return iend - anchor; + + /* Initialize the rolling hash state with the first minMatchLength bytes */ + ZSTD_ldm_gear_init(&hashState, params); + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip += minMatchLength; + + while (ip < ilimit) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip, + splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); + candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); + PREFETCH_L1(candidates[n].bucket); + } + + for (n = 0; n < numSplits; n++) { + size_t forwardMatchLength = 0, backwardMatchLength = 0, + bestMatchLength = 0, mLength; + U32 offset; + BYTE const* const split = candidates[n].split; + U32 const checksum = candidates[n].checksum; + U32 const hash = candidates[n].hash; + ldmEntry_t* const bucket = candidates[n].bucket; + ldmEntry_t const* cur; + ldmEntry_t const* bestEntry = NULL; + ldmEntry_t newEntry; + + newEntry.offset = (U32)(split - base); + newEntry.checksum = checksum; + + /* If a split point would generate a sequence overlapping with + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); + continue; + } + + for (cur = bucket; cur < bucket + entsPerBucket; cur++) { + size_t curForwardMatchLength, curBackwardMatchLength, + curTotalMatchLength; + if (cur->checksum != checksum || cur->offset <= lowestIndex) { + continue; + } + if (extDict) { + BYTE const* const curMatchBase = + cur->offset < dictLimit ? dictBase : base; + BYTE const* const pMatch = curMatchBase + cur->offset; + BYTE const* const matchEnd = + cur->offset < dictLimit ? dictEnd : iend; + BYTE const* const lowMatchPtr = + cur->offset < dictLimit ? 
dictStart : lowPrefixPtr; + curForwardMatchLength = + ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments( + split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd); + } else { /* !extDict */ + BYTE const* const pMatch = base + cur->offset; + curForwardMatchLength = ZSTD_count(split, pMatch, iend); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr); + } + curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength; + + if (curTotalMatchLength > bestMatchLength) { + bestMatchLength = curTotalMatchLength; + forwardMatchLength = curForwardMatchLength; + backwardMatchLength = curBackwardMatchLength; + bestEntry = cur; + } + } + + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); + continue; + } + + /* Match found */ + offset = (U32)(split - base) - bestEntry->offset; + mLength = forwardMatchLength + backwardMatchLength; + { + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(split - backwardMatchLength - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); + + anchor = split + forwardMatchLength; + + /* If we find a match that ends after the data that we've hashed + * then we have a repeating, overlapping, pattern. E.g. all zeros. + * If one repetition of the pattern matches our `stopMask` then all + * repetitions will. We don't need to insert them all into out table, + * only the first one. So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); + /* Continue the outer loop at anchor (ip + hashed == anchor). */ + ip = anchor - hashed; + break; + } + } + + ip += hashed; + } + + return iend - anchor; +} + +/*! ZSTD_ldm_reduceTable() : + * reduce table indexes by `reducerValue` */ +static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u].offset < reducerValue) table[u].offset = 0; + else table[u].offset -= reducerValue; + } +} + +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldmState, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + U32 const maxDist = 1U << params->windowLog; + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + size_t const kMaxChunkSize = 1 << 20; + size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); + size_t chunk; + size_t leftoverSize = 0; + + assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); + /* Check that ZSTD_window_update() has been called for this chunk prior + * to passing it to this function. 
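+ * The assert below checks this (in debug builds): the window must already
+ * cover the whole [src, src+srcSize) range before sequence generation starts.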
+ */ + assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); + /* The input could be very large (in zstdmt), so it must be broken up into + * chunks to enforce the maximum distance and handle overflow correction. + */ + assert(sequences->pos <= sequences->size); + assert(sequences->size <= sequences->capacity); + for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { + BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; + size_t const remaining = (size_t)(iend - chunkStart); + BYTE const *const chunkEnd = + (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize; + size_t const chunkSize = chunkEnd - chunkStart; + size_t newLeftoverSize; + size_t const prevSize = sequences->size; + + assert(chunkStart < iend); + /* 1. Perform overflow correction if necessary. */ + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { + U32 const ldmHSize = 1U << params->hashLog; + U32 const correction = ZSTD_window_correctOverflow( + &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); + ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + /* invalidate dictionaries on overflow correction */ + ldmState->loadedDictEnd = 0; + } + /* 2. We enforce the maximum offset allowed. + * + * kMaxChunkSize should be small enough that we don't lose too much of + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the + * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may + * be split into two sequences. This condition holds when using + * ZSTD_window_enforceMaxDist(), but if we move to checking offsets + * against maxDist directly, we'll have to carefully handle that case. + */ + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); + /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ + newLeftoverSize = ZSTD_ldm_generateSequences_internal( + ldmState, sequences, params, chunkStart, chunkSize); + if (ZSTD_isError(newLeftoverSize)) + return newLeftoverSize; + /* 4. We add the leftover literals from previous iterations to the first + * newly generated sequence, or add the `newLeftoverSize` if none are + * generated. 
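+ * (annotation: e.g. if a chunk produces no sequences, its entire chunkSize
+ * becomes leftover, and is later folded into the litLength of the first
+ * sequence produced by a subsequent chunk)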
+ */ + /* Prepend the leftover literals from the last call */ + if (prevSize < sequences->size) { + sequences->seq[prevSize].litLength += (U32)leftoverSize; + leftoverSize = newLeftoverSize; + } else { + assert(newLeftoverSize == chunkSize); + leftoverSize += chunkSize; + } + } + return 0; +} + +void +ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) +{ + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; + if (srcSize <= seq->litLength) { + /* Skip past srcSize literals */ + seq->litLength -= (U32)srcSize; + return; + } + srcSize -= seq->litLength; + seq->litLength = 0; + if (srcSize < seq->matchLength) { + /* Skip past the first srcSize of the match */ + seq->matchLength -= (U32)srcSize; + if (seq->matchLength < minMatch) { + /* The match is too short, omit it */ + if (rawSeqStore->pos + 1 < rawSeqStore->size) { + seq[1].litLength += seq[0].matchLength; + } + rawSeqStore->pos++; + } + return; + } + srcSize -= seq->matchLength; + seq->matchLength = 0; + rawSeqStore->pos++; + } +} + +/** + * If the sequence length is longer than remaining then the sequence is split + * between this block and the next. + * + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. + */ +static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) +{ + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; + assert(sequence.offset > 0); + /* Likely: No partial sequence */ + if (remaining >= sequence.litLength + sequence.matchLength) { + rawSeqStore->pos++; + return sequence; + } + /* Cut the sequence short (offset == 0 ==> rest is literals). */ + if (remaining <= sequence.litLength) { + sequence.offset = 0; + } else if (remaining < sequence.litLength + sequence.matchLength) { + sequence.matchLength = remaining - sequence.litLength; + if (sequence.matchLength < minMatch) { + sequence.offset = 0; + } + } + /* Skip past `remaining` bytes for the future sequences. 
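+ * (annotation: e.g. with remaining==100 and a stored sequence of
+ * litLength==40 / matchLength==120, the sequence returned keeps its 40
+ * literals but its match is cut to 60 bytes; the remaining 60 match bytes
+ * stay in rawSeqStore for the next block)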
*/ + ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); + return sequence; +} + +void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; + } + } + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } +} + +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; + ZSTD_blockCompressor const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + /* Input positions */ + BYTE const* ip = istart; + + DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); + /* If using opt parser, use LDMs only as candidates rather than always accepting them */ + if (cParams->strategy >= ZSTD_btopt) { + size_t lastLLSize; + ms->ldmSeqStore = rawSeqStore; + lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize); + ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize); + return lastLLSize; + } + + assert(rawSeqStore->pos <= rawSeqStore->size); + assert(rawSeqStore->size <= rawSeqStore->capacity); + /* Loop through each sequence and apply the block compressor to the literals */ + while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); + int i; + /* End signal */ + if (sequence.offset == 0) + break; + + assert(ip + sequence.litLength + sequence.matchLength <= iend); + + /* Fill tables for block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; + /* Update the repcodes */ + for (i = ZSTD_REP_NUM - 1; i > 0; i--) + rep[i] = rep[i-1]; + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, + OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } + } + /* Fill the tables for the block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Compress the last literals */ + return blockCompressor(ms, seqStore, rep, ip, iend - ip); +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.h new file mode 100644 index 000000000..4e68dbf52 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LDM_H +#define ZSTD_LDM_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "zstd_compress_internal.h" /* ldmParams_t, U32 */ +#include "../zstd.h" /* ZSTD_CCtx, size_t */ + +/*-************************************* +* Long distance matching +***************************************/ + +#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT + +void ZSTD_ldm_fillHashTable( + ldmState_t* state, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params); + +/** + * ZSTD_ldm_generateSequences(): + * + * Generates the sequences using the long distance match finder. + * Generates long range matching sequences in `sequences`, which parse a prefix + * of the source. `sequences` must be large enough to store every sequence, + * which can be checked with `ZSTD_ldm_getMaxNbSeq()`. + * @returns 0 or an error code. + * + * NOTE: The user must have called ZSTD_window_update() for all of the input + * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks. + * NOTE: This function returns an error if it runs out of space to store + * sequences. + */ +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldms, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + +/** + * ZSTD_ldm_blockCompress(): + * + * Compresses a block using the predefined sequences, along with a secondary + * block compressor. The literals section of every sequence is passed to the + * secondary block compressor, and those sequences are interspersed with the + * predefined sequences. Returns the length of the last literals. + * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed. + * `rawSeqStore.seq` may also be updated to split the last sequence between two + * blocks. + * @return The length of the last literals. + * + * NOTE: The source must be at most the maximum block size, but the predefined + * sequences can be any size, and may be longer than the block. In the case that + * they are longer than the block, the last sequences may need to be split into + * two. We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + +/** + * ZSTD_ldm_skipSequences(): + * + * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + +/* ZSTD_ldm_skipRawSeqStoreBytes(): + * Moves forward in rawSeqStore by nbBytes, updating fields 'pos' and 'posInSequence'. + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); + +/** ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is + * disabled. 
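+ *  e.g. with hashLog==20 and bucketSizeLog==3, this is roughly 2^20
+ *  ldmEntry_t slots plus 2^17 one-byte bucket offsets, before workspace
+ *  alignment.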
+ */ +size_t ZSTD_ldm_getTableSize(ldmParams_t params); + +/** ZSTD_ldm_getSeqSpace() : + * Return an upper bound on the number of sequences that can be produced by + * the long distance matcher, or 0 if LDM is disabled. + */ +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + +/** ZSTD_ldm_adjustParameters() : + * If the params->hashRateLog is not set, set it to its default value based on + * windowLog and params->hashLog. + * + * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to + * params->hashLog if it is not). + * + * Ensures that the minMatchLength >= targetLength during optimal parsing. + */ +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm_geartab.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm_geartab.h new file mode 100644 index 000000000..647f865be --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_ldm_geartab.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LDM_GEARTAB_H +#define ZSTD_LDM_GEARTAB_H + +#include "../common/compiler.h" /* UNUSED_ATTR */ +#include "../common/mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = { + 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, + 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, + 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, + 0x9c8528f65badeaca, 0x86563706e2097529, 0x2902475fa375d889, + 0xafb32a9739a5ebe6, 0xce2714da3883e639, 0x21eaf821722e69e, + 0x37b628620b628, 0x49a8d455d88caf5, 0x8556d711e6958140, + 0x4f7ae74fc605c1f, 0x829f0c3468bd3a20, 0x4ffdc885c625179e, + 0x8473de048a3daf1b, 0x51008822b05646b2, 0x69d75d12b2d1cc5f, + 0x8c9d4a19159154bc, 0xc3cc10f4abbd4003, 0xd06ddc1cecb97391, + 0xbe48e6e7ed80302e, 0x3481db31cee03547, 0xacc3f67cdaa1d210, + 0x65cb771d8c7f96cc, 0x8eb27177055723dd, 0xc789950d44cd94be, + 0x934feadc3700b12b, 0x5e485f11edbdf182, 0x1e2e2a46fd64767a, + 0x2969ca71d82efa7c, 0x9d46e9935ebbba2e, 0xe056b67e05e6822b, + 0x94d73f55739d03a0, 0xcd7010bdb69b5a03, 0x455ef9fcd79b82f4, + 0x869cb54a8749c161, 0x38d1a4fa6185d225, 0xb475166f94bbe9bb, + 0xa4143548720959f1, 0x7aed4780ba6b26ba, 0xd0ce264439e02312, + 0x84366d746078d508, 0xa8ce973c72ed17be, 0x21c323a29a430b01, + 0x9962d617e3af80ee, 0xab0ce91d9c8cf75b, 0x530e8ee6d19a4dbc, + 0x2ef68c0cf53f5d72, 0xc03a681640a85506, 0x496e4e9f9c310967, + 0x78580472b59b14a0, 0x273824c23b388577, 0x66bf923ad45cb553, + 0x47ae1a5a2492ba86, 0x35e304569e229659, 0x4765182a46870b6f, + 0x6cbab625e9099412, 0xddac9a2e598522c1, 0x7172086e666624f2, + 0xdf5003ca503b7837, 0x88c0c1db78563d09, 0x58d51865acfc289d, + 0x177671aec65224f1, 0xfb79d8a241e967d7, 0x2be1e101cad9a49a, + 0x6625682f6e29186b, 0x399553457ac06e50, 0x35dffb4c23abb74, + 0x429db2591f54aade, 0xc52802a8037d1009, 0x6acb27381f0b25f3, + 0xf45e2551ee4f823b, 0x8b0ea2d99580c2f7, 0x3bed519cbcb4e1e1, + 0xff452823dbb010a, 0x9d42ed614f3dd267, 0x5b9313c06257c57b, + 0xa114b8008b5e1442, 0xc1fe311c11c13d4b, 0x66e8763ea34c5568, + 0x8b982af1c262f05d, 0xee8876faaa75fbb7, 0x8a62a4d0d172bb2a, + 0xc13d94a3b7449a97, 
0x6dbbba9dc15d037c, 0xc786101f1d92e0f1, + 0xd78681a907a0b79b, 0xf61aaf2962c9abb9, 0x2cfd16fcd3cb7ad9, + 0x868c5b6744624d21, 0x25e650899c74ddd7, 0xba042af4a7c37463, + 0x4eb1a539465a3eca, 0xbe09dbf03b05d5ca, 0x774e5a362b5472ba, + 0x47a1221229d183cd, 0x504b0ca18ef5a2df, 0xdffbdfbde2456eb9, + 0x46cd2b2fbee34634, 0xf2aef8fe819d98c3, 0x357f5276d4599d61, + 0x24a5483879c453e3, 0x88026889192b4b9, 0x28da96671782dbec, + 0x4ef37c40588e9aaa, 0x8837b90651bc9fb3, 0xc164f741d3f0e5d6, + 0xbc135a0a704b70ba, 0x69cd868f7622ada, 0xbc37ba89e0b9c0ab, + 0x47c14a01323552f6, 0x4f00794bacee98bb, 0x7107de7d637a69d5, + 0x88af793bb6f2255e, 0xf3c6466b8799b598, 0xc288c616aa7f3b59, + 0x81ca63cf42fca3fd, 0x88d85ace36a2674b, 0xd056bd3792389e7, + 0xe55c396c4e9dd32d, 0xbefb504571e6c0a6, 0x96ab32115e91e8cc, + 0xbf8acb18de8f38d1, 0x66dae58801672606, 0x833b6017872317fb, + 0xb87c16f2d1c92864, 0xdb766a74e58b669c, 0x89659f85c61417be, + 0xc8daad856011ea0c, 0x76a4b565b6fe7eae, 0xa469d085f6237312, + 0xaaf0365683a3e96c, 0x4dbb746f8424f7b8, 0x638755af4e4acc1, + 0x3d7807f5bde64486, 0x17be6d8f5bbb7639, 0x903f0cd44dc35dc, + 0x67b672eafdf1196c, 0xa676ff93ed4c82f1, 0x521d1004c5053d9d, + 0x37ba9ad09ccc9202, 0x84e54d297aacfb51, 0xa0b4b776a143445, + 0x820d471e20b348e, 0x1874383cb83d46dc, 0x97edeec7a1efe11c, + 0xb330e50b1bdc42aa, 0x1dd91955ce70e032, 0xa514cdb88f2939d5, + 0x2791233fd90db9d3, 0x7b670a4cc50f7a9b, 0x77c07d2a05c6dfa5, + 0xe3778b6646d0a6fa, 0xb39c8eda47b56749, 0x933ed448addbef28, + 0xaf846af6ab7d0bf4, 0xe5af208eb666e49, 0x5e6622f73534cd6a, + 0x297daeca42ef5b6e, 0x862daef3d35539a6, 0xe68722498f8e1ea9, + 0x981c53093dc0d572, 0xfa09b0bfbf86fbf5, 0x30b1e96166219f15, + 0x70e7d466bdc4fb83, 0x5a66736e35f2a8e9, 0xcddb59d2b7c1baef, + 0xd6c7d247d26d8996, 0xea4e39eac8de1ba3, 0x539c8bb19fa3aff2, + 0x9f90e4c5fd508d8, 0xa34e5956fbaf3385, 0x2e2f8e151d3ef375, + 0x173691e9b83faec1, 0xb85a8d56bf016379, 0x8382381267408ae3, + 0xb90f901bbdc0096d, 0x7c6ad32933bcec65, 0x76bb5e2f2c8ad595, + 0x390f851a6cf46d28, 0xc3e6064da1c2da72, 0xc52a0c101cfa5389, + 0xd78eaf84a3fbc530, 0x3781b9e2288b997e, 0x73c2f6dea83d05c4, + 0x4228e364c5b5ed7, 0x9d7a3edf0da43911, 0x8edcfeda24686756, + 0x5e7667a7b7a9b3a1, 0x4c4f389fa143791d, 0xb08bc1023da7cddc, + 0x7ab4be3ae529b1cc, 0x754e6132dbe74ff9, 0x71635442a839df45, + 0x2f6fb1643fbe52de, 0x961e0a42cf7a8177, 0xf3b45d83d89ef2ea, + 0xee3de4cf4a6e3e9b, 0xcd6848542c3295e7, 0xe4cee1664c78662f, + 0x9947548b474c68c4, 0x25d73777a5ed8b0b, 0xc915b1d636b7fc, + 0x21c2ba75d9b0d2da, 0x5f6b5dcf608a64a1, 0xdcf333255ff9570c, + 0x633b922418ced4ee, 0xc136dde0b004b34a, 0x58cc83b05d4b2f5a, + 0x5eb424dda28e42d2, 0x62df47369739cd98, 0xb4e0b42485e4ce17, + 0x16e1f0c1f9a8d1e7, 0x8ec3916707560ebf, 0x62ba6e2df2cc9db3, + 0xcbf9f4ff77d83a16, 0x78d9d7d07d2bbcc4, 0xef554ce1e02c41f4, + 0x8d7581127eccf94d, 0xa9b53336cb3c8a05, 0x38c42c0bf45c4f91, + 0x640893cdf4488863, 0x80ec34bc575ea568, 0x39f324f5b48eaa40, + 0xe9d9ed1f8eff527f, 0x9224fc058cc5a214, 0xbaba00b04cfe7741, + 0x309a9f120fcf52af, 0xa558f3ec65626212, 0x424bec8b7adabe2f, + 0x41622513a6aea433, 0xb88da2d5324ca798, 0xd287733b245528a4, + 0x9a44697e6d68aec3, 0x7b1093be2f49bb28, 0x50bbec632e3d8aad, + 0x6cd90723e1ea8283, 0x897b9e7431b02bf3, 0x219efdcb338a7047, + 0x3b0311f0a27c0656, 0xdb17bf91c0db96e7, 0x8cd4fd6b4e85a5b2, + 0xfab071054ba6409d, 0x40d6fe831fa9dfd9, 0xaf358debad7d791e, + 0xeb8d0e25a65e3e58, 0xbbcbd3df14e08580, 0xcf751f27ecdab2b, + 0x2b4da14f2613d8f4 +}; + +#endif /* ZSTD_LDM_GEARTAB_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.c b/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.c new file 
mode 100644 index 000000000..800f87e9e --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.c @@ -0,0 +1,1449 @@
+/*
+ * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "hist.h"
+#include "zstd_opt.h"
+
+
+#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+#define ZSTD_MAX_PRICE (1<<30)
+
+#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+
+
+/*-*************************************
+* Price functions for optimal parser
+***************************************/
+
+#if 0 /* approximation at bit level (for tests) */
+# define BITCOST_ACCURACY 0
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+#elif 0 /* fractional bit accuracy (for tests) */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#else /* opt==approx, ultra==accurate */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+{
+    return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+}
+
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+    U32 const stat = rawStat + 1;
+    U32 const hb = ZSTD_highbit32(stat);
+    U32 const BWeight = hb * BITCOST_MULTIPLIER;
+    U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+    U32 const weight = BWeight + FWeight;
+    assert(hb + BITCOST_ACCURACY < 31);
+    return weight;
+}
+
+#if (DEBUGLEVEL>=2)
+/* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+MEM_STATIC double ZSTD_fCost(U32 price)
+{
+    return (double)price / (BITCOST_MULTIPLIER*8);
+}
+#endif
+
+static int ZSTD_compressedLiterals(optState_t const* const optPtr)
+{
+    return optPtr->literalCompressionMode != ZSTD_ps_disable;
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{
+    if (ZSTD_compressedLiterals(optPtr))
+        optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+    optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+    optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+    optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+}
+
+
+static U32 sum_u32(const unsigned table[], size_t nbElts)
+{
+    size_t n;
+    U32 total = 0;
+    for (n=0; n<nbElts; n++) {
+        total += table[n];
+    }
+    return total;
+}
+
+static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
+{
+    U32 s, sum=0;
+    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+    assert(shift < 30);
+    for (s=0; s<lastEltIndex+1; s++) {
+        table[s] = 1 + (table[s] >> shift);
+        sum += table[s];
+    }
+    return sum;
+}
+
+/* ZSTD_scaleStats() :
+ * reduce all elements in table if sum too large
+ * return the resulting sum of elements */
+static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+{
+    U32 const prevsum = sum_u32(table, lastEltIndex+1);
+    U32 const factor = prevsum >> logTarget;
+    DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
+    assert(logTarget < 30);
+    if (factor <= 1) return prevsum;
+    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+}
+
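+/* Annotation (not upstream code) : a minimal sketch of how the fixed-point
+ * weights above become symbol prices.  WEIGHT(stat) approximates
+ * BITCOST_MULTIPLIER * log2(stat), up to a constant that cancels in the
+ * subtraction, so a symbol's price weight(sum) - weight(freq) is its
+ * Shannon cost log2(sum/freq) in 1/256-bit units.  Compiled out via #if 0;
+ * the histogram values and the name price_demo are illustrative only. */
+#if 0
+#include <stdio.h>
+static void price_demo(void)
+{
+    /* toy literal histogram : 'e' is common, 'z' is rare */
+    U32 const litSum = 4096, freqE = 512, freqZ = 16;
+    /* prints ~3.0 bits : log2(4096/512) */
+    printf("price('e') ~ %.2f bits\n",
+           (double)(ZSTD_fracWeight(litSum) - ZSTD_fracWeight(freqE)) / BITCOST_MULTIPLIER);
+    /* prints ~7.9 bits : log2(4096/16) */
+    printf("price('z') ~ %.2f bits\n",
+           (double)(ZSTD_fracWeight(litSum) - ZSTD_fracWeight(freqZ)) / BITCOST_MULTIPLIER);
+}
+#endif
+
+/* 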
ZSTD_rescaleFreqs() : + * if first block (detected by optPtr->litLengthSum == 0) : init statistics + * take hints from dictionary if there is one + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols + * otherwise downscale existing stats, to be used as seed for next block. + */ +static void +ZSTD_rescaleFreqs(optState_t* const optPtr, + const BYTE* const src, size_t const srcSize, + int const optLevel) +{ + int const compressedLiterals = ZSTD_compressedLiterals(optPtr); + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + + if (optPtr->litLengthSum == 0) { /* first block : init */ + if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ + DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { + /* huffman table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; + for (lit=0; lit<=MaxLit; lit++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); + assert(bitCost <= scaleLog); + optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litSum += optPtr->litFreq[lit]; + } } + + { unsigned ll; + FSE_CState_t llstate; + FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable); + optPtr->litLengthSum = 0; + for (ll=0; ll<=MaxLL; ll++) { + U32 const scaleLog = 10; /* scale to 1K */ + U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll); + assert(bitCost < scaleLog); + optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litLengthSum += optPtr->litLengthFreq[ll]; + } } + + { unsigned ml; + FSE_CState_t mlstate; + FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable); + optPtr->matchLengthSum = 0; + for (ml=0; ml<=MaxML; ml++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml); + assert(bitCost < scaleLog); + optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->matchLengthSum += optPtr->matchLengthFreq[ml]; + } } + + { unsigned of; + FSE_CState_t ofstate; + FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable); + optPtr->offCodeSum = 0; + for (of=0; of<=MaxOff; of++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of); + assert(bitCost < scaleLog); + optPtr->offCodeFreq[of] = bitCost ? 
1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + + } else { /* not a dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); + optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1); + } + + { unsigned ml; + for (ml=0; ml<=MaxML; ml++) + optPtr->matchLengthFreq[ml] = 1; + } + optPtr->matchLengthSum = MaxML+1; + + { unsigned const baseOFCfreqs[MaxOff+1] = { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + + + } + + } else { /* new block : re-use previous statistics, scaled down */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); + optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); + optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); + optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11); + } + + ZSTD_setBasePrices(optPtr, optLevel); +} + +/* ZSTD_rawLiteralsCost() : + * price of literals (only) in specified segment (which length can be 0). + * does not include price of literalLength symbol */ +static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) +{ + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) + return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bytes per literal. */ + + if (optPtr->priceType == zop_predef) + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ + { U32 price = optPtr->litSumBasePrice * litLength; + U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; + assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { + U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); + if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; + price -= litPrice; + } + return price; + } +} + +/* ZSTD_litLengthPrice() : + * cost of literalLength symbol */ +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) +{ + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); + /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX + * because it isn't representable in the zstd format. So instead just + * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block + * would be all literals. 
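+ * (annotation: i.e. price(ZSTD_BLOCKSIZE_MAX) is defined recursively as
+ * price(ZSTD_BLOCKSIZE_MAX - 1) plus one full bit, which the code below
+ * adds as BITCOST_MULTIPLIER)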
+ */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); + + /* dynamic statistics */ + { U32 const llCode = ZSTD_LLcode(litLength); + return (LL_bits[llCode] * BITCOST_MULTIPLIER) + + optPtr->litLengthSumBasePrice + - WEIGHT(optPtr->litLengthFreq[llCode], optLevel); + } +} + +/* ZSTD_getMatchPrice() : + * Provides the cost of the match part (offset + matchLength) of a sequence + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. + * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) +{ + U32 price; + U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + + if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ + return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); + if ((optLevel<2) /*static*/ && offCode >= 20) + price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */ + + /* match Length */ + { U32 const mlCode = ZSTD_MLcode(mlBase); + price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel)); + } + + price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor less sequences -> faster decompression speed */ + + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); + return price; +} + +/* ZSTD_updateStats() : + * assumption : literals + litLength <= iend */ +static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, + U32 offBase, U32 matchLength) +{ + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { + U32 u; + for (u=0; u < litLength; u++) + optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + optPtr->litSum += litLength*ZSTD_LITFREQ_ADD; + } + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + optPtr->litLengthFreq[llCode]++; + optPtr->litLengthSum++; + } + + /* offset code : expected to follow storeSeq() numeric representation */ + { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; + } + + /* match Length */ + { U32 const mlBase = matchLength - MINMATCH; + U32 const mlCode = ZSTD_MLcode(mlBase); + optPtr->matchLengthFreq[mlCode]++; + optPtr->matchLengthSum++; + } +} + + +/* ZSTD_readMINMATCH() : + * function safe only for comparisons + * assumption : memPtr must be at least 4 bytes before end of buffer */ +MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) +{ + switch (length) + { + default : + case 4 : return MEM_read32(memPtr); + case 3 : if (MEM_isLittleEndian()) + return MEM_read32(memPtr)<<8; + else + return MEM_read32(memPtr)>>8; + } +} + + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) +{ + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; + const BYTE* const base = ms->window.base; + U32 idx = *nextToUpdate3; + U32 const target = (U32)(ip - base); + size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3); + assert(hashLog3 > 0); + + while(idx < target) { + hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; + idx++; + } + + *nextToUpdate3 = target; + return hashTable3[hash3]; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +static U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + const U32 curr = (U32)(ip-base); + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the tree update. 
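+ * (annotation: candidate indexes below this windowLow are never followed
+ * in the search loop, so entries that will have expired by position
+ * `target` are pruned up front)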
+ */ + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); + U32 matchEndIdx = curr+8+1; + size_t bestLength = 8; + U32 nbCompares = 1U << cParams->searchLog; +#ifdef ZSTD_C_PREDICT + U32 predictedSmall = *(bt + 2*((curr-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((curr-1)&btMask) + 1); + predictedSmall += (predictedSmall>0); + predictedLarge += (predictedLarge>0); +#endif /* ZSTD_C_PREDICT */ + + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); + + assert(curr <= target); + assert(ip <= iend-8); /* required for h calculation */ + hashTable[h] = curr; /* Update Hash Table */ + + assert(windowLow > 0); + for (; nbCompares && (matchIndex >= windowLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < curr); + +#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ + const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ + if (matchIndex == predictedSmall) { + /* no need to check length, result known */ + *smallerPtr = matchIndex; + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + predictedSmall = predictPtr[1] + (predictPtr[1]>0); + continue; + } + if (matchIndex == predictedLarge) { + *largerPtr = matchIndex; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + predictedLarge = predictPtr[0] + (predictPtr[0]>0); + continue; + } +#endif + + if (!extDict || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + } + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; 
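+    /* (annotation) The value returned below tells the caller how far it may
+     * skip ahead: the distance to 8 bytes before the farthest match end
+     * recorded in matchEndIdx, or up to 192 positions when the best match
+     * exceeds 384 bytes, whichever is larger; very long matches indicate
+     * repetitive data where per-position tree updates add little. */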
+ { U32 positions = 0; + if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + assert(matchEndIdx > curr + 8); + return MAX(positions, matchEndIdx - (curr + 8)); + } +} + +FORCE_INLINE_TEMPLATE +void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); + assert(idx < (U32)(idx + forward)); + idx += forward; + } + assert((size_t)(ip - base) <= (size_t)(U32)(-1)); + assert((size_t)(iend - base) <= (size_t)(U32)(-1)); + ms->nextToUpdate = target; +} + +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ + const U32 lengthToBeat, + U32 const mls /* template */) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + const BYTE* const base = ms->window.base; + U32 const curr = (U32)(ip-base); + U32 const hashLog = cParams->hashLog; + U32 const minMatch = (mls==3) ? 3 : 4; + U32* const hashTable = ms->hashTable; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask= (1U << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + U32 const dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); + U32 const matchLow = windowLow ? windowLow : 1; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + + const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL; + const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL; + U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0; + U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0; + U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? 
windowLow - dmsHighLimit : 0; + U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog; + U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog; + U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0; + U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; + + size_t bestLength = lengthToBeat-1; + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", curr); + + /* check repCode */ + assert(ll0 <= 1); /* necessarily 1 or 0 */ + { U32 const lastR = ZSTD_REP_NUM + ll0; + U32 repCode; + for (repCode = ll0; repCode < lastR; repCode++) { + U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + U32 const repIndex = curr - repOffset; + U32 repLen = 0; + assert(curr >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < curr-dictLimit) { /* equivalent to `curr > repIndex >= dictLimit` */ + /* We must validate the repcode offset because when we're using a dictionary the + * valid offset range shrinks when the dictionary goes out of bounds. + */ + if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) { + repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; + } + } else { /* repIndex < dictLimit || repIndex >= curr */ + const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? + dmsBase + repIndex - dmsIndexDelta : + dictBase + repIndex; + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ + & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ + & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } + /* save longer solution */ + if (repLen > bestLength) { + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; + matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) + | (ip+repLen == iLimit) ) { /* best possible */ + return mnum; + } } } } + + /* HC3 match finder */ + if ((mls == 3) /*static*/ && (bestLength < mls)) { + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip); + if ((matchIndex3 >= matchLow) + & (curr - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + size_t mlen; + if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { + const BYTE* const match = base + matchIndex3; + mlen = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* 
const match = dictBase + matchIndex3; + mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart); + } + + /* save best solution */ + if (mlen >= mls /* == 3 > bestLength */) { + DEBUGLOG(8, "found small match with hlog3, of length %u", + (U32)mlen); + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ + matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | + (ip+mlen == iLimit) ) { /* best possible length */ + ms->nextToUpdate = curr+1; /* skip insertion */ + return 1; + } } } + /* no dictMatchState lookup: dicts don't have a populated HC3 table */ + } /* if (mls == 3) */ + + hashTable[h] = curr; /* Update Hash Table */ + + for (; nbCompares && (matchIndex >= matchLow); --nbCompares) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + const BYTE* match; + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(curr > matchIndex); + + if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ + match = base + matchIndex; + if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); + } else { + match = dictBase + matchIndex; + assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* prepare for match[matchLength] read */ + } + + if (matchLength > bestLength) { + DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + } } + + if (match[matchLength] < ip[matchLength]) { + /* match smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */ + } else { + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
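+     * (each search loop tests nbCompares before decrementing it, so it can reach 0 but never wrap)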
*/ + if (dictMode == ZSTD_dictMatchState && nbCompares) { + size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); + U32 dictMatchIndex = dms->hashTable[dmsH]; + const U32* const dmsBt = dms->chainTable; + commonLengthSmaller = commonLengthLarger = 0; + for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) { + const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dmsBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart); + if (dictMatchIndex+matchLength >= dmsHighLimit) + match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; + DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } } + + if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ + if (match[matchLength] < ip[matchLength]) { + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } } } /* if (dictMode == ZSTD_dictMatchState) */ + + assert(matchEndIdx > curr+8); + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + return mnum; +} + +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_matchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) +{ + assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_matchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, 
ZSTD_##dictMode, mls); \ + } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ + } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 < 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; +} + +/************************* +* LDM helper functions * +*************************/ + +/* Struct containing info needed to make decision about ldm inclusion */ +typedef struct { + rawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +} ZSTD_optLdm_t; + +/* ZSTD_optLdm_skipRawSeqStoreBytes(): + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) +{ + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; + } + } + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } +} + +/* ZSTD_opt_getNextMatchAndUpdateSeqStore(): + * Calculates the beginning and end of the next match in the current block. + * Updates 'pos' and 'posInSequence' of the ldmSeqStore. + */ +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 blockBytesRemaining) +{ + rawSeq currSeq; + U32 currBlockEndPos; + U32 literalsBytesRemaining; + U32 matchBytesRemaining; + + /* Setting match end position to MAX to ensure we never use an LDM during this block */ + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + optLdm->startPosInBlock = UINT_MAX; + optLdm->endPosInBlock = UINT_MAX; + return; + } + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ + currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos]; + assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength); + currBlockEndPos = currPosInBlock + blockBytesRemaining; + literalsBytesRemaining = (optLdm->seqStore.posInSequence < currSeq.litLength) ? + currSeq.litLength - (U32)optLdm->seqStore.posInSequence : + 0; + matchBytesRemaining = (literalsBytesRemaining == 0) ? 
+            currSeq.matchLength - ((U32)optLdm->seqStore.posInSequence - currSeq.litLength) :
+            currSeq.matchLength;
+
+    /* If there are more literal bytes than bytes remaining in block, no ldm is possible */
+    if (literalsBytesRemaining >= blockBytesRemaining) {
+        optLdm->startPosInBlock = UINT_MAX;
+        optLdm->endPosInBlock = UINT_MAX;
+        ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, blockBytesRemaining);
+        return;
+    }
+
+    /* Matches may be < MINMATCH by this process. In that case, we will reject them
+       when we are deciding whether or not to add the ldm */
+    optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining;
+    optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining;
+    optLdm->offset = currSeq.offset;
+
+    if (optLdm->endPosInBlock > currBlockEndPos) {
+        /* Match ends after the block ends, we can't use the whole match */
+        optLdm->endPosInBlock = currBlockEndPos;
+        ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, currBlockEndPos - currPosInBlock);
+    } else {
+        /* Consume nb of bytes equal to size of sequence left */
+        ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, literalsBytesRemaining + matchBytesRemaining);
+    }
+}
+
+/* ZSTD_optLdm_maybeAddMatch():
+ * Adds a match if it's long enough,
+ * based on its 'startPosInBlock' and 'endPosInBlock',
+ * into 'matches'. Maintains the correct ordering of 'matches'.
+ */
+static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+                                      const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+{
+    U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
+    /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
+    U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
+
+    /* Ensure that current block position is not outside of the match */
+    if (currPosInBlock < optLdm->startPosInBlock
+      || currPosInBlock >= optLdm->endPosInBlock
+      || candidateMatchLength < MINMATCH) {
+        return;
+    }
+
+    if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
+        U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+                 candidateOffBase, candidateMatchLength, currPosInBlock);
+        matches[*nbMatches].len = candidateMatchLength;
+        matches[*nbMatches].off = candidateOffBase;
+        (*nbMatches)++;
+    }
+}
+
+/* ZSTD_optLdm_processMatchCandidate():
+ * Wrapper function to update ldm seq store and call ldm functions as necessary.
+ */
+static void
+ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
+                                  ZSTD_match_t* matches, U32* nbMatches,
+                                  U32 currPosInBlock, U32 remainingBytes)
+{
+    if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
+        return;
+    }
+
+    if (currPosInBlock >= optLdm->endPosInBlock) {
+        if (currPosInBlock > optLdm->endPosInBlock) {
+            /* The position at which ZSTD_optLdm_processMatchCandidate() is called is not necessarily
+             * at the end of a match from the ldm seq store, and will often be some bytes
+             * over beyond endPosInBlock.
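+             * The parser only calls back at positions it chooses to price, so stepping past a match end is expected.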
As such, we need to correct for these "overshoots" + */ + U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } + ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); +} + + +/*-******************************* +* Optimal parser +*********************************/ + +static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +{ + return sol.litlen + sol.mlen; +} + +#if 0 /* debug */ + +static void +listStats(const U32* table, int lastEltID) +{ + int const nbElts = lastEltID + 1; + int enb; + for (enb=0; enb < nbElts; enb++) { + (void)table; + /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */ + RAWLOG(2, "%4i,", table[enb]); + } + RAWLOG(2, " \n"); +} + +#endif + +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, + const ZSTD_dictMode_e dictMode) +{ + optState_t* const optStatePtr = &ms->opt; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + + ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode); + + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4; + U32 nextToUpdate3 = ms->nextToUpdate; + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; + ZSTD_optimal_t lastSequence; + ZSTD_optLdm_t optLdm; + + optLdm.seqStore = ms->ldmSeqStore ? 
*ms->ldmSeqStore : kNullRawSeqStore;
+    optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
+    ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
+                (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
+    assert(optLevel <= 2);
+    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
+    ip += (ip==prefixStart);
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, last_pos = 0;
+
+        /* find first match */
+        {   U32 const litlen = (U32)(ip - anchor);
+            U32 const ll0 = !litlen;
+            U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
+            ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
+                                              (U32)(ip-istart), (U32)(iend - ip));
+            if (!nbMatches) { ip++; continue; }
+
+            /* initialize opt[0] */
+            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+            opt[0].mlen = 0;  /* means is_a_literal */
+            opt[0].litlen = litlen;
+            /* We don't need to include the actual price of the literals because
+             * it is static for the duration of the forward pass, and is included
+             * in every price. We include the literal length to avoid negative
+             * prices when we subtract the previous literal length.
+             */
+            opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+
+            /* large match -> immediate encoding */
+            {   U32 const maxML = matches[nbMatches-1].len;
+                U32 const maxOffBase = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
+
+                if (maxML > sufficient_len) {
+                    lastSequence.litlen = litlen;
+                    lastSequence.mlen = maxML;
+                    lastSequence.off = maxOffBase;
+                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+                                maxML, sufficient_len);
+                    cur = 0;
+                    last_pos = ZSTD_totalLen(lastSequence);
+                    goto _shortestPath;
+            }   }
+
+            /* set prices for first matches starting position == 0 */
+            assert(opt[0].price >= 0);
+            {   U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+                U32 pos;
+                U32 matchNb;
+                for (pos = 1; pos < minMatch; pos++) {
+                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
+                }
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offBase = matches[matchNb].off;
+                    U32 const end = matches[matchNb].len;
+                    for ( ; pos <= end ; pos++ ) {
+                        U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
+                        U32 const sequencePrice = literalsPrice + matchPrice;
+                        DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+                                    pos, ZSTD_fCost(sequencePrice));
+                        opt[pos].mlen = pos;
+                        opt[pos].off = offBase;
+                        opt[pos].litlen = litlen;
+                        opt[pos].price = (int)sequencePrice;
+                }   }
+                last_pos = pos-1;
+            }
+        }
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+            const BYTE* const inr = ip + cur;
+            assert(cur < ZSTD_OPT_NUM);
+            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+
+            /* Fix current position with one literal if cheaper */
+            {   U32 const litlen = (opt[cur-1].mlen == 0) ?
opt[cur-1].litlen + 1 : 1; + int const price = opt[cur-1].price + + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) + + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) + - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); + opt[cur].mlen = 0; + opt[cur].off = 0; + opt[cur].litlen = litlen; + opt[cur].price = price; + } else { + DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), + opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + } + } + + /* Set the repcodes of the current position. We must do it here + * because we rely on the repcodes of the 2nd to last sequence being + * correct to set the next chunks repcodes during the backward + * traversal. + */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); + if (opt[cur].mlen != 0) { + U32 const prev = cur - opt[cur].mlen; + repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); + } else { + ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ + if (inr > ilimit) continue; + + if (cur == last_pos) break; + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { + DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); + { U32 const ll0 = (opt[cur].mlen != 0); + U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; + U32 const previousPrice = (U32)opt[cur].price; + U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, + (U32)(inr-istart), (U32)(iend-inr)); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + + { U32 const maxML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", + inr-istart, cur, nbMatches, maxML); + + if ( (maxML > sufficient_len) + || (cur + maxML >= ZSTD_OPT_NUM) ) { + lastSequence.mlen = maxML; + lastSequence.off = matches[nbMatches-1].off; + lastSequence.litlen = litlen; + cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ + last_pos = cur + ZSTD_totalLen(lastSequence); + if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + goto _shortestPath; + } } + + /* set prices using matches found at position == cur */ + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const lastML = matches[matchNb].len; + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + + DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; + int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ + opt[pos].mlen = mlen; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + lastSequence = opt[last_pos]; + cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ + assert(cur < ZSTD_OPT_NUM); /* control overflow*/ + +_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); + + /* Set the next chunk's repcodes based on the repcodes of the beginning + * of the last match, and the last sequence. This avoids us having to + * update them while traversing the sequences. + */ + if (lastSequence.mlen != 0) { + repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); + ZSTD_memcpy(rep, &reps, sizeof(reps)); + } else { + ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); + } + + { U32 const storeEnd = cur + 1; + U32 storeStart = storeEnd; + U32 seqPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; + assert(storeEnd < ZSTD_OPT_NUM); + DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); + opt[storeEnd] = lastSequence; + while (seqPos > 0) { + U32 const backDist = ZSTD_totalLen(opt[seqPos]); + storeStart--; + DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); + opt[storeStart] = opt[seqPos]; + seqPos = (seqPos > backDist) ? 
seqPos - backDist : 0; + } + + /* save sequences */ + DEBUGLOG(6, "sending selected sequences into seqStore") + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; + U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ + ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */ + continue; /* will finish */ + } + + assert(anchor + llen <= iend); + ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + +static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); +} + +static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); +} + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} + + + + +/* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. + * this function cannot error, hence its contract must be respected. 
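+ * note : the pass below runs the btultra2 parser once, keeps only the statistics
+ * accumulated in ms->opt, then rewinds the seqStore and the window so the real
+ * compression pass starts from a clean state.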
+ */ +static void +ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); + + DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize); + assert(ms->opt.litLengthSum == 0); /* first block */ + assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */ + assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ + assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + + /* invalidate first scan from history */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; + ms->window.lowLimit = ms->window.dictLimit; + ms->nextToUpdate = ms->window.dictLimit; + +} + +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + + /* 2-pass strategy: + * this strategy makes a first pass over first block to collect statistics + * and seed next round's statistics with it. + * After 1st pass, function forgets everything, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), + * the cost is 2x cpu time on first block. 
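+ * Eligibility is re-checked below : first block, no ldm, no dictionary,
+ * start of frame, and a block larger than ZSTD_PREDEF_THRESHOLD.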
*/ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ + && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); +} + +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); +} + +/* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.h b/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.h new file mode 100644 index 000000000..627255f53 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstd_opt.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_OPT_H +#define ZSTD_OPT_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include "zstd_compress_internal.h" + +/* used in ZSTD_loadDictionaryContent() */ +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_OPT_H */ diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.c b/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.c new file mode 100644 index 000000000..0c10eb603 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.c @@ -0,0 +1,1866 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* ====== Compiler specifics ====== */ +#if defined(_MSC_VER) +# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */ +#endif + + +/* ====== Constants ====== */ +#define ZSTDMT_OVERLAPLOG_DEFAULT 0 + + +/* ====== Dependencies ====== */ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */ +#include "../common/mem.h" /* MEM_STATIC */ +#include "../common/pool.h" /* threadpool */ +#include "../common/threading.h" /* mutex */ +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstd_ldm.h" +#include "zstdmt_compress.h" + +/* Guards code to support resizing the SeqPool. + * We will want to resize the SeqPool to save memory in the future. + * Until then, comment the code out since it is unused. 
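+ * (the guarded code is ZSTDMT_resizeBuffer() and ZSTDMT_resizeSeq(), below)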
+ */
+#define ZSTD_RESIZE_SEQPOOL 0
+
+/* ====== Debug ====== */
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \
+    && !defined(_MSC_VER) \
+    && !defined(__MINGW32__)
+
+# include <stdio.h>
+# include <unistd.h>
+# include <sys/times.h>
+
+# define DEBUG_PRINTHEX(l,p,n) {           \
+    unsigned debug_u;                      \
+    for (debug_u=0; debug_u<(n); debug_u++) \
+        RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+    RAWLOG(l, " \n");                      \
+}
+
+static unsigned long long GetCurrentClockTimeMicroseconds(void)
+{
+   static clock_t _ticksPerSecond = 0;
+   if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK);
+
+   { struct tms junk; clock_t newTicks = (clock_t) times(&junk);
+     return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond);
+}  }
+
+#define MUTEX_WAIT_TIME_DLEVEL 6
+#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) {          \
+    if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {   \
+        unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
+        ZSTD_pthread_mutex_lock(mutex);           \
+        {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
+            unsigned long long const elapsedTime = (afterTime-beforeTime); \
+            if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
+                DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
+                         elapsedTime, #mutex);    \
+        }   }                                     \
+    } else {                                      \
+        ZSTD_pthread_mutex_lock(mutex);           \
+    }                                             \
+}
+
+#else
+
+# define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)
+# define DEBUG_PRINTHEX(l,p,n) {}
+
+#endif
+
+
+/* ===== Buffer Pool ===== */
+/* a single Buffer Pool can be invoked from multiple threads in parallel */
+
+typedef struct buffer_s {
+    void* start;
+    size_t capacity;
+} buffer_t;
+
+static const buffer_t g_nullBuffer = { NULL, 0 };
+
+typedef struct ZSTDMT_bufferPool_s {
+    ZSTD_pthread_mutex_t poolMutex;
+    size_t bufferSize;
+    unsigned totalBuffers;
+    unsigned nbBuffers;
+    ZSTD_customMem cMem;
+    buffer_t bTable[1];   /* variable size */
+} ZSTDMT_bufferPool;
+
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
+{
+    ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc(
+        sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem);
+    if (bufPool==NULL) return NULL;
+    if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
+        ZSTD_customFree(bufPool, cMem);
+        return NULL;
+    }
+    bufPool->bufferSize = 64 KB;
+    bufPool->totalBuffers = maxNbBuffers;
+    bufPool->nbBuffers = 0;
+    bufPool->cMem = cMem;
+    return bufPool;
+}
+
+static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    unsigned u;
+    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
+    if (!bufPool) return;   /* compatibility with free on NULL */
+    for (u=0; u<bufPool->totalBuffers; u++) {
+        DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start);
+        ZSTD_customFree(bufPool->bTable[u].start, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
+    ZSTD_customFree(bufPool, bufPool->cMem);
+}
+
+/* only works at initialization, not during compression */
+static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const poolSize = sizeof(*bufPool)
+                          + (bufPool->totalBuffers - 1) * sizeof(buffer_t);
+    unsigned u;
+    size_t totalBufferSize = 0;
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    for (u=0; u<bufPool->totalBuffers; u++)
+        totalBufferSize += bufPool->bTable[u].capacity;
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+
+    return poolSize + totalBufferSize;
+}
+
+/* ZSTDMT_setBufferSize() :
+ * all future
buffers provided by this buffer pool will have _at least_ this size + * note : it's better for all buffers to have same size, + * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */ +static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize) +{ + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize); + bufPool->bufferSize = bSize; + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); +} + + +static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers) +{ + if (srcBufPool==NULL) return NULL; + if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */ + return srcBufPool; + /* need a larger buffer pool */ + { ZSTD_customMem const cMem = srcBufPool->cMem; + size_t const bSize = srcBufPool->bufferSize; /* forward parameters */ + ZSTDMT_bufferPool* newBufPool; + ZSTDMT_freeBufferPool(srcBufPool); + newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem); + if (newBufPool==NULL) return newBufPool; + ZSTDMT_setBufferSize(newBufPool, bSize); + return newBufPool; + } +} + +/** ZSTDMT_getBuffer() : + * assumption : bufPool must be valid + * @return : a buffer, with start pointer and size + * note: allocation may fail, in this case, start==NULL and size==0 */ +static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) +{ + size_t const bSize = bufPool->bufferSize; + DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize); + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers) { /* try to use an existing buffer */ + buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)]; + size_t const availBufferSize = buf.capacity; + bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer; + if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) { + /* large enough, but not too much */ + DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u", + bufPool->nbBuffers, (U32)buf.capacity); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + return buf; + } + /* size conditions not respected : scratch this buffer, create new one */ + DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing"); + ZSTD_customFree(buf.start, bufPool->cMem); + } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + /* create new buffer */ + DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer"); + { buffer_t buffer; + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); + buffer.start = start; /* note : start can be NULL if malloc fails ! */ + buffer.capacity = (start==NULL) ? 0 : bSize; + if (start==NULL) { + DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!"); + } else { + DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize); + } + return buffer; + } +} + +#if ZSTD_RESIZE_SEQPOOL +/** ZSTDMT_resizeBuffer() : + * assumption : bufPool must be valid + * @return : a buffer that is at least the buffer pool buffer size. + * If a reallocation happens, the data in the input buffer is copied. + */ +static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer) +{ + size_t const bSize = bufPool->bufferSize; + if (buffer.capacity < bSize) { + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); + buffer_t newBuffer; + newBuffer.start = start; + newBuffer.capacity = start == NULL ? 
0 : bSize; + if (start != NULL) { + assert(newBuffer.capacity >= buffer.capacity); + ZSTD_memcpy(newBuffer.start, buffer.start, buffer.capacity); + DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize); + return newBuffer; + } + DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!"); + } + return buffer; +} +#endif + +/* store buffer for later re-use, up to pool capacity */ +static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) +{ + DEBUGLOG(5, "ZSTDMT_releaseBuffer"); + if (buf.start == NULL) return; /* compatible with release on NULL */ + ZSTD_pthread_mutex_lock(&bufPool->poolMutex); + if (bufPool->nbBuffers < bufPool->totalBuffers) { + bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", + (U32)buf.capacity, (U32)(bufPool->nbBuffers-1)); + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + return; + } + ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); + /* Reached bufferPool capacity (should not happen) */ + DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing "); + ZSTD_customFree(buf.start, bufPool->cMem); +} + +/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released. + * The 3 additional buffers are as follows: + * 1 buffer for input loading + * 1 buffer for "next input" when submitting current one + * 1 buffer stuck in queue */ +#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2*(nbWorkers) + 3) + +/* After a worker releases its rawSeqStore, it is immediately ready for reuse. + * So we only need one seq buffer per worker. */ +#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers) + +/* ===== Seq Pool Wrapper ====== */ + +typedef ZSTDMT_bufferPool ZSTDMT_seqPool; + +static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool) +{ + return ZSTDMT_sizeof_bufferPool(seqPool); +} + +static rawSeqStore_t bufferToSeq(buffer_t buffer) +{ + rawSeqStore_t seq = kNullRawSeqStore; + seq.seq = (rawSeq*)buffer.start; + seq.capacity = buffer.capacity / sizeof(rawSeq); + return seq; +} + +static buffer_t seqToBuffer(rawSeqStore_t seq) +{ + buffer_t buffer; + buffer.start = seq.seq; + buffer.capacity = seq.capacity * sizeof(rawSeq); + return buffer; +} + +static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) +{ + if (seqPool->bufferSize == 0) { + return kNullRawSeqStore; + } + return bufferToSeq(ZSTDMT_getBuffer(seqPool)); +} + +#if ZSTD_RESIZE_SEQPOOL +static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +{ + return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq))); +} +#endif + +static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +{ + ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq)); +} + +static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq) +{ + ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq)); +} + +static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) +{ + ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem); + if (seqPool == NULL) return NULL; + ZSTDMT_setNbSeq(seqPool, 0); + return seqPool; +} + +static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool) +{ + ZSTDMT_freeBufferPool(seqPool); +} + +static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers) +{ + return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers)); +} + + +/* ===== CCtx Pool ===== */ +/* a single CCtx Pool can be invoked from 
multiple threads in parallel */
+
+typedef struct {
+    ZSTD_pthread_mutex_t poolMutex;
+    int totalCCtx;
+    int availCCtx;
+    ZSTD_customMem cMem;
+    ZSTD_CCtx* cctx[1];   /* variable size */
+} ZSTDMT_CCtxPool;
+
+/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
+static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
+{
+    int cid;
+    for (cid=0; cid<pool->totalCCtx; cid++)
+        ZSTD_freeCCtx(pool->cctx[cid]);  /* note : compatible with free on NULL */
+    ZSTD_pthread_mutex_destroy(&pool->poolMutex);
+    ZSTD_customFree(pool, pool->cMem);
+}
+
+/* ZSTDMT_createCCtxPool() :
+ * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx() */
+static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
+                                              ZSTD_customMem cMem)
+{
+    ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) ZSTD_customCalloc(
+        sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem);
+    assert(nbWorkers > 0);
+    if (!cctxPool) return NULL;
+    if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) {
+        ZSTD_customFree(cctxPool, cMem);
+        return NULL;
+    }
+    cctxPool->cMem = cMem;
+    cctxPool->totalCCtx = nbWorkers;
+    cctxPool->availCCtx = 1;   /* at least one cctx for single-thread mode */
+    cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem);
+    if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
+    DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
+    return cctxPool;
+}
+
+static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool,
+                                              int nbWorkers)
+{
+    if (srcPool==NULL) return NULL;
+    if (nbWorkers <= srcPool->totalCCtx) return srcPool;   /* good enough */
+    /* need a larger cctx pool */
+    {   ZSTD_customMem const cMem = srcPool->cMem;
+        ZSTDMT_freeCCtxPool(srcPool);
+        return ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    }
+}
+
+/* only works during initialization phase, not during compression */
+static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
+{
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    {   unsigned const nbWorkers = cctxPool->totalCCtx;
+        size_t const poolSize = sizeof(*cctxPool)
+                              + (nbWorkers-1) * sizeof(ZSTD_CCtx*);
+        unsigned u;
+        size_t totalCCtxSize = 0;
+        for (u=0; u<nbWorkers; u++) {
+            totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctx[u]);
+        }
+        ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+        assert(nbWorkers > 0);
+        return poolSize + totalCCtxSize;
+    }
+}
+
+static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool)
+{
+    DEBUGLOG(5, "ZSTDMT_getCCtx");
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    if (cctxPool->availCCtx) {
+        cctxPool->availCCtx--;
+        {   ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx];
+            ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+            return cctx;
+    }   }
+    ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+    DEBUGLOG(5, "create one more CCtx");
+    return ZSTD_createCCtx_advanced(cctxPool->cMem);   /* note : can be NULL, when creation fails ! */
+}
+
+static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return;   /* compatibility with release on NULL */
+    ZSTD_pthread_mutex_lock(&pool->poolMutex);
+    if (pool->availCCtx < pool->totalCCtx)
+        pool->cctx[pool->availCCtx++] = cctx;
+    else {
+        /* pool overflow : should not happen, since totalCCtx==nbWorkers */
+        DEBUGLOG(4, "CCtx pool overflow : free cctx");
+        ZSTD_freeCCtx(cctx);
+    }
+    ZSTD_pthread_mutex_unlock(&pool->poolMutex);
+}
+
+/* ====   Serial State   ==== */
+
+typedef struct {
+    void const* start;
+    size_t size;
+} range_t;
+
+typedef struct {
+    /* All variables in the struct are protected by mutex.
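+     * The serial section runs jobs in jobID order : LDM sequence generation and
+     * the XXH64 frame checksum are inherently sequential, so each worker waits
+     * for its turn before touching this state.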
*/
+    ZSTD_pthread_mutex_t mutex;
+    ZSTD_pthread_cond_t cond;
+    ZSTD_CCtx_params params;
+    ldmState_t ldmState;
+    XXH64_state_t xxhState;
+    unsigned nextJobID;
+    /* Protects ldmWindow.
+     * Must be acquired after the main mutex when acquiring both.
+     */
+    ZSTD_pthread_mutex_t ldmWindowMutex;
+    ZSTD_pthread_cond_t ldmWindowCond;  /* Signaled when ldmWindow is updated */
+    ZSTD_window_t ldmWindow;  /* A thread-safe copy of ldmState.window */
+} serialState_t;
+
+static int
+ZSTDMT_serialState_reset(serialState_t* serialState,
+                         ZSTDMT_seqPool* seqPool,
+                         ZSTD_CCtx_params params,
+                         size_t jobSize,
+                         const void* dict, size_t const dictSize,
+                         ZSTD_dictContentType_e dictContentType)
+{
+    /* Adjust parameters */
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
+        DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashRateLog < 32);
+    } else {
+        ZSTD_memset(&params.ldmParams, 0, sizeof(params.ldmParams));
+    }
+    serialState->nextJobID = 0;
+    if (params.fParams.checksumFlag)
+        XXH64_reset(&serialState->xxhState, 0);
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
+        ZSTD_customMem cMem = params.customMem;
+        unsigned const hashLog = params.ldmParams.hashLog;
+        size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
+        unsigned const bucketLog =
+            params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
+        unsigned const prevBucketLog =
+            serialState->params.ldmParams.hashLog -
+            serialState->params.ldmParams.bucketSizeLog;
+        size_t const numBuckets = (size_t)1 << bucketLog;
+        /* Size the seq pool tables */
+        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
+        /* Reset the window */
+        ZSTD_window_init(&serialState->ldmState.window);
+        /* Resize tables and output space if necessary. */
+        if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {
+            ZSTD_customFree(serialState->ldmState.hashTable, cMem);
+            serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem);
+        }
+        if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
+            ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem);
+            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem);
+        }
+        if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
+            return 1;
+        /* Zero the tables */
+        ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize);
+        ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets);
+
+        /* Update window state and fill hash table with dict */
+        serialState->ldmState.loadedDictEnd = 0;
+        if (dictSize > 0) {
+            if (dictContentType == ZSTD_dct_rawContent) {
+                BYTE const* const dictEnd = (const BYTE*)dict + dictSize;
+                ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
+                ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
+                serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
+            } else {
+                /* don't even load anything */
+            }
+        }
+
+        /* Initialize serialState's copy of ldmWindow.
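+         * Workers publish updates to it under ldmWindowMutex, so the main thread
+         * can track window progress without taking the job-ordering mutex.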
*/ + serialState->ldmWindow = serialState->ldmState.window; + } + + serialState->params = params; + serialState->params.jobSize = (U32)jobSize; + return 0; +} + +static int ZSTDMT_serialState_init(serialState_t* serialState) +{ + int initError = 0; + ZSTD_memset(serialState, 0, sizeof(*serialState)); + initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL); + initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL); + initError |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL); + return initError; +} + +static void ZSTDMT_serialState_free(serialState_t* serialState) +{ + ZSTD_customMem cMem = serialState->params.customMem; + ZSTD_pthread_mutex_destroy(&serialState->mutex); + ZSTD_pthread_cond_destroy(&serialState->cond); + ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex); + ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond); + ZSTD_customFree(serialState->ldmState.hashTable, cMem); + ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); +} + +static void ZSTDMT_serialState_update(serialState_t* serialState, + ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore, + range_t src, unsigned jobID) +{ + /* Wait for our turn */ + ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); + while (serialState->nextJobID < jobID) { + DEBUGLOG(5, "wait for serialState->cond"); + ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex); + } + /* A future job may error and skip our job */ + if (serialState->nextJobID == jobID) { + /* It is now our turn, do any processing necessary */ + if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) { + size_t error; + assert(seqStore.seq != NULL && seqStore.pos == 0 && + seqStore.size == 0 && seqStore.capacity > 0); + assert(src.size <= serialState->params.jobSize); + ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0); + error = ZSTD_ldm_generateSequences( + &serialState->ldmState, &seqStore, + &serialState->params.ldmParams, src.start, src.size); + /* We provide a large enough buffer to never fail. */ + assert(!ZSTD_isError(error)); (void)error; + /* Update ldmWindow to match the ldmState.window and signal the main + * thread if it is waiting for a buffer. 
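+             * Lock order : ldmWindowMutex is only ever taken after the main
+             * serial mutex, which is already held at this point.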
+             */
+            ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
+            serialState->ldmWindow = serialState->ldmState.window;
+            ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+            ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+        }
+        if (serialState->params.fParams.checksumFlag && src.size > 0)
+            XXH64_update(&serialState->xxhState, src.start, src.size);
+    }
+    /* Now it is the next job's turn */
+    serialState->nextJobID++;
+    ZSTD_pthread_cond_broadcast(&serialState->cond);
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+    if (seqStore.size > 0) {
+        size_t const err = ZSTD_referenceExternalSequences(
+            jobCCtx, seqStore.seq, seqStore.size);
+        assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
+        assert(!ZSTD_isError(err));
+        (void)err;
+    }
+}
+
+static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState,
+                                              unsigned jobID, size_t cSize)
+{
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    if (serialState->nextJobID <= jobID) {
+        assert(ZSTD_isError(cSize)); (void)cSize;
+        DEBUGLOG(5, "Skipping past job %u because of error", jobID);
+        serialState->nextJobID = jobID + 1;
+        ZSTD_pthread_cond_broadcast(&serialState->cond);
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
+        ZSTD_window_clear(&serialState->ldmWindow);
+        ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+        ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+    }
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+}
+
+
+/* ------------------------------------------ */
+/* ===== Worker thread ===== */
+/* ------------------------------------------ */
+
+static const range_t kNullRange = { NULL, 0 };
+
+typedef struct {
+    size_t consumed;                  /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
+    size_t cSize;                     /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
+    ZSTD_pthread_mutex_t job_mutex;   /* Thread-safe - used by mtctx and worker */
+    ZSTD_pthread_cond_t job_cond;     /* Thread-safe - used by mtctx and worker */
+    ZSTDMT_CCtxPool* cctxPool;        /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_bufferPool* bufPool;       /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_seqPool* seqPool;          /* Thread-safe - used by mtctx and (all) workers */
+    serialState_t* serial;            /* Thread-safe - used by mtctx and (all) workers */
+    buffer_t dstBuff;                 /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
+    range_t prefix;                   /* set by mtctx, then read by worker & mtctx => no barrier */
+    range_t src;                      /* set by mtctx, then read by worker & mtctx => no barrier */
+    unsigned jobID;                   /* set by mtctx, then read by worker => no barrier */
+    unsigned firstJob;                /* set by mtctx, then read by worker => no barrier */
+    unsigned lastJob;                 /* set by mtctx, then read by worker => no barrier */
+    ZSTD_CCtx_params params;          /* set by mtctx, then read by worker => no barrier */
+    const ZSTD_CDict* cdict;          /* set by mtctx, then read by worker => no barrier */
+    unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */
+    size_t dstFlushed;                /* used only by mtctx */
+    unsigned frameChecksumNeeded;     /* used only by mtctx */
+} ZSTDMT_jobDescription;
+
+#define JOB_ERROR(e) {                          \
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
+    job->cSize = e;                             \
+    ZSTD_pthread_mutex_unlock(&job->job_mutex); \
+    goto _endJob;                               \
+}
+
+/* ZSTDMT_compressionJob() is a POOL_function type */
+static void ZSTDMT_compressionJob(void* jobDescription)
+{
+    ZSTDMT_jobDescription* const job =
(ZSTDMT_jobDescription*)jobDescription; + ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */ + ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool); + rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); + buffer_t dstBuff = job->dstBuff; + size_t lastCBlockSize = 0; + + /* resources */ + if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation)); + if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ + dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation)); + job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ + } + if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL) + JOB_ERROR(ERROR(memory_allocation)); + + /* Don't compute the checksum for chunks, since we compute it externally, + * but write it in the header. + */ + if (job->jobID != 0) jobParams.fParams.checksumFlag = 0; + /* Don't run LDM for the chunks, since we handle it externally */ + jobParams.ldmParams.enableLdm = ZSTD_ps_disable; + /* Correct nbWorkers to 0. */ + jobParams.nbWorkers = 0; + + + /* init */ + if (job->cdict) { + size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize); + assert(job->firstJob); /* only allowed for first job */ + if (ZSTD_isError(initError)) JOB_ERROR(initError); + } else { /* srcStart points at reloaded section */ + U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size; + { size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob); + if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError); + } + if (!job->firstJob) { + size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0); + if (ZSTD_isError(err)) JOB_ERROR(err); + } + { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, + job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */ + ZSTD_dtlm_fast, + NULL, /*cdict*/ + &jobParams, pledgedSrcSize); + if (ZSTD_isError(initError)) JOB_ERROR(initError); + } } + + /* Perform serial step as early as possible, but after CCtx initialization */ + ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID); + + if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */ + size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); + if (ZSTD_isError(hSize)) JOB_ERROR(hSize); + DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize); + ZSTD_invalidateRepCodes(cctx); + } + + /* compress */ + { size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX; + int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize); + const BYTE* ip = (const BYTE*) job->src.start; + BYTE* const ostart = (BYTE*)dstBuff.start; + BYTE* op = ostart; + BYTE* oend = op + dstBuff.capacity; + int chunkNb; + if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize); /* check overflow */ + DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks); + assert(job->cSize == 0); + for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) { + size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize); + if (ZSTD_isError(cSize)) 
JOB_ERROR(cSize);
+            ip += chunkSize;
+            op += cSize; assert(op < oend);
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            job->consumed = chunkSize * chunkNb;
+            DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)",
+                        (U32)cSize, (U32)job->cSize);
+            ZSTD_pthread_cond_signal(&job->job_cond);   /* warns some more data is ready to be flushed */
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+        }
+        /* last block */
+        assert(chunkSize > 0);
+        assert((chunkSize & (chunkSize - 1)) == 0);   /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
+            size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
+            size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
+            size_t const cSize = (job->lastJob) ?
+                 ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
+                 ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
+            lastCBlockSize = cSize;
+    }   }
+    if (!job->firstJob) {
+        /* Double check that we don't have an ext-dict, because then our
+         * repcode invalidation doesn't work.
+         */
+        assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+    }
+    ZSTD_CCtx_trace(cctx, 0);
+
+_endJob:
+    ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
+    if (job->prefix.size > 0)
+        DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
+    DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
+    /* release resources */
+    ZSTDMT_releaseSeq(job->seqPool, rawSeqStore);
+    ZSTDMT_releaseCCtx(job->cctxPool, cctx);
+    /* report */
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+    if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0);
+    job->cSize += lastCBlockSize;
+    job->consumed = job->src.size;   /* when job->consumed == job->src.size , compression job is presumed completed */
+    ZSTD_pthread_cond_signal(&job->job_cond);
+    ZSTD_pthread_mutex_unlock(&job->job_mutex);
+}
+
+
+/* ------------------------------------------ */
+/* ===== Multi-threaded compression ===== */
+/* ------------------------------------------ */
+
+typedef struct {
+    range_t prefix;   /* read-only non-owned prefix buffer */
+    buffer_t buffer;
+    size_t filled;
+} inBuff_t;
+
+typedef struct {
+  BYTE* buffer;     /* The round input buffer. All jobs get references
+                     * to pieces of the buffer. ZSTDMT_tryGetInputRange()
+                     * handles handing out job input buffers, and makes
+                     * sure it doesn't overlap with any pieces still in use.
+                     */
+  size_t capacity;  /* The capacity of buffer. */
+  size_t pos;       /* The position of the current inBuff in the round
+                     * buffer. Updated past the end of the inBuff once
+                     * the inBuff is sent to the worker thread.
+                     * pos <= capacity.
+                     */
+} roundBuff_t;
+
+static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
+
+#define RSYNC_LENGTH 32
+/* Don't create chunks smaller than the zstd block size.
+ * This stops us from regressing compression ratio too much,
+ * and ensures our output fits in ZSTD_compressBound().
+ *
+ * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
+ * ZSTD_COMPRESSBOUND() will need to be updated.
+ */
+#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
+#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
+
+typedef struct {
+  U64 hash;
+  U64 hitMask;
+  U64 primePower;
+} rsyncState_t;
+
+struct ZSTDMT_CCtx_s {
+    POOL_ctx* factory;
+    ZSTDMT_jobDescription* jobs;
+    ZSTDMT_bufferPool* bufPool;
+    ZSTDMT_CCtxPool* cctxPool;
+    ZSTDMT_seqPool* seqPool;
+    ZSTD_CCtx_params params;
+    size_t targetSectionSize;
+    size_t targetPrefixSize;
+    int jobReady;   /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job.
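+                     * ZSTDMT_createCompressionJob() then retries POOL_tryAdd() on the
+                     * already-prepared job before preparing another one.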
+                     */
+    inBuff_t inBuff;
+    roundBuff_t roundBuff;
+    serialState_t serial;
+    rsyncState_t rsync;
+    unsigned jobIDMask;
+    unsigned doneJobID;
+    unsigned nextJobID;
+    unsigned frameEnded;
+    unsigned allJobsCompleted;
+    unsigned long long frameContentSize;
+    unsigned long long consumed;
+    unsigned long long produced;
+    ZSTD_customMem cMem;
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+    unsigned providedFactory: 1;
+};
+
+static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem)
+{
+    U32 jobNb;
+    if (jobTable == NULL) return;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        ZSTD_pthread_mutex_destroy(&jobTable[jobNb].job_mutex);
+        ZSTD_pthread_cond_destroy(&jobTable[jobNb].job_cond);
+    }
+    ZSTD_customFree(jobTable, cMem);
+}
+
+/* ZSTDMT_allocJobsTable()
+ * allocate and init a job table.
+ * update *nbJobsPtr to next power of 2 value, as size of table */
+static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr,
+                                                     ZSTD_customMem cMem)
+{
+    U32 const nbJobsLog2 = ZSTD_highbit32(*nbJobsPtr) + 1;
+    U32 const nbJobs = 1 << nbJobsLog2;
+    U32 jobNb;
+    ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*)
+                ZSTD_customCalloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem);
+    int initError = 0;
+    if (jobTable==NULL) return NULL;
+    *nbJobsPtr = nbJobs;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        initError |= ZSTD_pthread_mutex_init(&jobTable[jobNb].job_mutex, NULL);
+        initError |= ZSTD_pthread_cond_init(&jobTable[jobNb].job_cond, NULL);
+    }
+    if (initError != 0) {
+        ZSTDMT_freeJobsTable(jobTable, nbJobs, cMem);
+        return NULL;
+    }
+    return jobTable;
+}
+
+static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) {
+    U32 nbJobs = nbWorkers + 2;
+    if (nbJobs > mtctx->jobIDMask+1) {   /* need more job capacity */
+        ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+        mtctx->jobIDMask = 0;
+        mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, mtctx->cMem);
+        if (mtctx->jobs==NULL) return ERROR(memory_allocation);
+        assert((nbJobs != 0) && ((nbJobs & (nbJobs - 1)) == 0));   /* ensure nbJobs is a power of 2 */
+        mtctx->jobIDMask = nbJobs - 1;
+    }
+    return 0;
+}
+
+
+/* ZSTDMT_CCtxParam_setNbWorkers():
+ * Internal use only */
+static size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers)
+{
+    return ZSTD_CCtxParams_setParameter(params, ZSTD_c_nbWorkers, (int)nbWorkers);
+}
+
+MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool)
+{
+    ZSTDMT_CCtx* mtctx;
+    U32 nbJobs = nbWorkers + 2;
+    int initError;
+    DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers);
+
+    if (nbWorkers < 1) return NULL;
+    nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX);
+    if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL))
+        /* invalid custom allocator */
+        return NULL;
+
+    mtctx = (ZSTDMT_CCtx*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtx), cMem);
+    if (!mtctx) return NULL;
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
+    mtctx->cMem = cMem;
+    mtctx->allJobsCompleted = 1;
+    if (pool != NULL) {
+        mtctx->factory = pool;
+        mtctx->providedFactory = 1;
+    }
+    else {
+        mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem);
+        mtctx->providedFactory = 0;
+    }
+    mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
+    assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0);   /* ensure nbJobs is a power of 2 */
+    mtctx->jobIDMask = nbJobs - 1;
+    mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
+    mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
+    initError = ZSTDMT_serialState_init(&mtctx->serial);
+    mtctx->roundBuff = kNullRoundBuff;
+    if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
+        ZSTDMT_freeCCtx(mtctx);
+        return NULL;
+    }
+    DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers);
+    return mtctx;
+}
+
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool)
+{
+#ifdef ZSTD_MULTITHREAD
+    return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem, pool);
+#else
+    (void)nbWorkers;
+    (void)cMem;
+    (void)pool;
+    return NULL;
+#endif
+}
+
+
+/* ZSTDMT_releaseAllJobResources() :
+ * note : ensure all workers are killed first !
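+ * Buffers are returned to their pools rather than freed, and each job's
+ * mutex/cond survive the reset, so the mtctx remains reusable.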
*/ +static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) +{ + unsigned jobID; + DEBUGLOG(3, "ZSTDMT_releaseAllJobResources"); + for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) { + /* Copy the mutex/cond out */ + ZSTD_pthread_mutex_t const mutex = mtctx->jobs[jobID].job_mutex; + ZSTD_pthread_cond_t const cond = mtctx->jobs[jobID].job_cond; + + DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); + + /* Clear the job description, but keep the mutex/cond */ + ZSTD_memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID])); + mtctx->jobs[jobID].job_mutex = mutex; + mtctx->jobs[jobID].job_cond = cond; + } + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + mtctx->allJobsCompleted = 1; +} + +static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx) +{ + DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted"); + while (mtctx->doneJobID < mtctx->nextJobID) { + unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask; + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex); + while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) { + DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */ + ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); + mtctx->doneJobID++; + } +} + +size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) +{ + if (mtctx==NULL) return 0; /* compatible with free on NULL */ + if (!mtctx->providedFactory) + POOL_free(mtctx->factory); /* stop and free worker threads */ + ZSTDMT_releaseAllJobResources(mtctx); /* release job resources into pools first */ + ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); + ZSTDMT_freeBufferPool(mtctx->bufPool); + ZSTDMT_freeCCtxPool(mtctx->cctxPool); + ZSTDMT_freeSeqPool(mtctx->seqPool); + ZSTDMT_serialState_free(&mtctx->serial); + ZSTD_freeCDict(mtctx->cdictLocal); + if (mtctx->roundBuff.buffer) + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + ZSTD_customFree(mtctx, mtctx->cMem); + return 0; +} + +size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx) +{ + if (mtctx == NULL) return 0; /* supports sizeof NULL */ + return sizeof(*mtctx) + + POOL_sizeof(mtctx->factory) + + ZSTDMT_sizeof_bufferPool(mtctx->bufPool) + + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription) + + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool) + + ZSTDMT_sizeof_seqPool(mtctx->seqPool) + + ZSTD_sizeof_CDict(mtctx->cdictLocal) + + mtctx->roundBuff.capacity; +} + + +/* ZSTDMT_resize() : + * @return : error code if fails, 0 on success */ +static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers) +{ + if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation); + FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , ""); + mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers)); + if (mtctx->bufPool == NULL) return ERROR(memory_allocation); + mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers); + if (mtctx->cctxPool == NULL) return ERROR(memory_allocation); + mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers); + if (mtctx->seqPool == NULL) return ERROR(memory_allocation); + ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); + return 0; +} + + +/*! 
ZSTDMT_updateCParams_whileCompressing() : + * Updates a selected set of compression parameters, remaining compatible with currently active frame. + * New parameters will be applied to next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams) +{ + U32 const saved_wlog = mtctx->params.cParams.windowLog; /* Do not modify windowLog while compressing */ + int const compressionLevel = cctxParams->compressionLevel; + DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)", + compressionLevel); + mtctx->params.compressionLevel = compressionLevel; + { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + cParams.windowLog = saved_wlog; + mtctx->params.cParams = cParams; + } +} + +/* ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + * Note : mutex will be acquired during statistics collection inside workers. */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) +{ + ZSTD_frameProgression fps; + DEBUGLOG(5, "ZSTDMT_getFrameProgression"); + fps.ingested = mtctx->consumed + mtctx->inBuff.filled; + fps.consumed = mtctx->consumed; + fps.produced = fps.flushed = mtctx->produced; + fps.currentJobID = mtctx->nextJobID; + fps.nbActiveWorkers = 0; + { unsigned jobNb; + unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); + DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", + mtctx->doneJobID, lastJobNb, mtctx->jobReady) + for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { + unsigned const wJobID = jobNb & mtctx->jobIDMask; + ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; + size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; + size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed; + assert(flushed <= produced); + fps.ingested += jobPtr->src.size; + fps.consumed += jobPtr->consumed; + fps.produced += produced; + fps.flushed += flushed; + fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size); + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + } + return fps; +} + + +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx) +{ + size_t toFlush; + unsigned const jobID = mtctx->doneJobID; + assert(jobID <= mtctx->nextJobID); + if (jobID == mtctx->nextJobID) return 0; /* no active job => nothing to flush */ + + /* look into oldest non-fully-flushed job */ + { unsigned const wJobID = jobID & mtctx->jobIDMask; + ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID]; + ZSTD_pthread_mutex_lock(&jobPtr->job_mutex); + { size_t const cResult = jobPtr->cSize; + size_t const produced = ZSTD_isError(cResult) ? 0 : cResult; + size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed; + assert(flushed <= produced); + assert(jobPtr->consumed <= jobPtr->src.size); + toFlush = produced - flushed; + /* if toFlush==0, nothing is available to flush. + * However, jobID is expected to still be active: + * if jobID was already completed and fully flushed, + * ZSTDMT_flushProduced() should have already moved onto next job. + * Therefore, some input has not yet been consumed. 
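+ * In that case, flushing is limited by the speed of the oldest job
+ * (see also the ZSTDMT_toFlushNow() description in zstdmt_compress.h).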
*/ + if (toFlush==0) { + assert(jobPtr->consumed < jobPtr->src.size); + } + } + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + } + + return toFlush; +} + + +/* ------------------------------------------ */ +/* ===== Multi-threaded compression ===== */ +/* ------------------------------------------ */ + +static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params) +{ + unsigned jobLog; + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* In Long Range Mode, the windowLog is typically oversized. + * In which case, it's preferable to determine the jobSize + * based on cycleLog instead. */ + jobLog = MAX(21, ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3); + } else { + jobLog = MAX(20, params->cParams.windowLog + 2); + } + return MIN(jobLog, (unsigned)ZSTDMT_JOBLOG_MAX); +} + +static int ZSTDMT_overlapLog_default(ZSTD_strategy strat) +{ + switch(strat) + { + case ZSTD_btultra2: + return 9; + case ZSTD_btultra: + case ZSTD_btopt: + return 8; + case ZSTD_btlazy2: + case ZSTD_lazy2: + return 7; + case ZSTD_lazy: + case ZSTD_greedy: + case ZSTD_dfast: + case ZSTD_fast: + default:; + } + return 6; +} + +static int ZSTDMT_overlapLog(int ovlog, ZSTD_strategy strat) +{ + assert(0 <= ovlog && ovlog <= 9); + if (ovlog == 0) return ZSTDMT_overlapLog_default(strat); + return ovlog; +} + +static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params) +{ + int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy); + int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog); + assert(0 <= overlapRLog && overlapRLog <= 8); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* In Long Range Mode, the windowLog is typically oversized. + * In which case, it's preferable to determine the jobSize + * based on chainLog instead. + * Then, ovLog becomes a fraction of the jobSize, rather than windowSize */ + ovLog = MIN(params->cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) + - overlapRLog; + } + assert(0 <= ovLog && ovLog <= ZSTD_WINDOWLOG_MAX); + DEBUGLOG(4, "overlapLog : %i", params->overlapLog); + DEBUGLOG(4, "overlap size : %i", 1 << ovLog); + return (ovLog==0) ? 
0 : (size_t)1 << ovLog;
+}
+
+/* ====================================== */
+/* ======= Streaming API ======= */
+/* ====================================== */
+
+size_t ZSTDMT_initCStream_internal(
+        ZSTDMT_CCtx* mtctx,
+        const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+        const ZSTD_CDict* cdict, ZSTD_CCtx_params params,
+        unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)",
+                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx);
+
+    /* params supposed partially validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));   /* either dict or cdict, not both */
+
+    /* init */
+    if (params.nbWorkers != mtctx->params.nbWorkers)
+        FORWARD_IF_ERROR( ZSTDMT_resize(mtctx, params.nbWorkers) , "");
+
+    if (params.jobSize != 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN;
+    if (params.jobSize > (size_t)ZSTDMT_JOBSIZE_MAX) params.jobSize = (size_t)ZSTDMT_JOBSIZE_MAX;
+
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers);
+
+    if (mtctx->allJobsCompleted == 0) {   /* previous compression not correctly finished */
+        ZSTDMT_waitForAllJobsCompleted(mtctx);
+        ZSTDMT_releaseAllJobResources(mtctx);
+        mtctx->allJobsCompleted = 1;
+    }
+
+    mtctx->params = params;
+    mtctx->frameContentSize = pledgedSrcSize;
+    if (dict) {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                    ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */
+                                    params.cParams, mtctx->cMem);
+        mtctx->cdict = mtctx->cdictLocal;
+        if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = NULL;
+        mtctx->cdict = cdict;
+    }
+
+    mtctx->targetPrefixSize = ZSTDMT_computeOverlapSize(&params);
+    DEBUGLOG(4, "overlapLog=%i => %u KB", params.overlapLog, (U32)(mtctx->targetPrefixSize>>10));
+    mtctx->targetSectionSize = params.jobSize;
+    if (mtctx->targetSectionSize == 0) {
+        mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(&params);
+    }
+    assert(mtctx->targetSectionSize <= (size_t)ZSTDMT_JOBSIZE_MAX);
+
+    if (params.rsyncable) {
+        /* Aim for the targetSectionSize as the average job size. */
+        U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
+        U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
+        /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
+         * expected job size is at least 4x larger. */
+        assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
+        DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
+        mtctx->rsync.hash = 0;
+        mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
+        mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
+    }
+    if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize;   /* job size must be >= overlap size */
+    DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize);
+    DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
+    {
+        /* If ldm is enabled we need windowSize space. */
+        size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ?
(1U << mtctx->params.cParams.windowLog) : 0; + /* Two buffers of slack, plus extra space for the overlap + * This is the minimum slack that LDM works with. One extra because + * flush might waste up to targetSectionSize-1 bytes. Another extra + * for the overlap (if > 0), then one to fill which doesn't overlap + * with the LDM window. + */ + size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0); + size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers; + /* Compute the total size, and always have enough slack */ + size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1); + size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers; + size_t const capacity = MAX(windowSize, sectionsSize) + slackSize; + if (mtctx->roundBuff.capacity < capacity) { + if (mtctx->roundBuff.buffer) + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + mtctx->roundBuff.buffer = (BYTE*)ZSTD_customMalloc(capacity, mtctx->cMem); + if (mtctx->roundBuff.buffer == NULL) { + mtctx->roundBuff.capacity = 0; + return ERROR(memory_allocation); + } + mtctx->roundBuff.capacity = capacity; + } + } + DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10)); + mtctx->roundBuff.pos = 0; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + mtctx->inBuff.prefix = kNullRange; + mtctx->doneJobID = 0; + mtctx->nextJobID = 0; + mtctx->frameEnded = 0; + mtctx->allJobsCompleted = 0; + mtctx->consumed = 0; + mtctx->produced = 0; + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize, + dict, dictSize, dictContentType)) + return ERROR(memory_allocation); + return 0; +} + + +/* ZSTDMT_writeLastEmptyBlock() + * Write a single empty block with an end-of-frame to finish a frame. + * Job must be created from streaming variant. + * This function is always successful if expected conditions are fulfilled. 
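+ * The "empty block" is a raw block with the last-block flag set and zero
+ * content size, hence a capacity of ZSTD_blockHeaderSize is sufficient.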
+ */ +static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job) +{ + assert(job->lastJob == 1); + assert(job->src.size == 0); /* last job is empty -> will be simplified into a last empty block */ + assert(job->firstJob == 0); /* cannot be first job, as it also needs to create frame header */ + assert(job->dstBuff.start == NULL); /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */ + job->dstBuff = ZSTDMT_getBuffer(job->bufPool); + if (job->dstBuff.start == NULL) { + job->cSize = ERROR(memory_allocation); + return; + } + assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */ + job->src = kNullRange; + job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity); + assert(!ZSTD_isError(job->cSize)); + assert(job->consumed == 0); +} + +static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp) +{ + unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask; + int const endFrame = (endOp == ZSTD_e_end); + + if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full"); + assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask)); + return 0; + } + + if (!mtctx->jobReady) { + BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start; + DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ", + mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size); + mtctx->jobs[jobID].src.start = src; + mtctx->jobs[jobID].src.size = srcSize; + assert(mtctx->inBuff.filled >= srcSize); + mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix; + mtctx->jobs[jobID].consumed = 0; + mtctx->jobs[jobID].cSize = 0; + mtctx->jobs[jobID].params = mtctx->params; + mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? 
mtctx->cdict : NULL; + mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize; + mtctx->jobs[jobID].dstBuff = g_nullBuffer; + mtctx->jobs[jobID].cctxPool = mtctx->cctxPool; + mtctx->jobs[jobID].bufPool = mtctx->bufPool; + mtctx->jobs[jobID].seqPool = mtctx->seqPool; + mtctx->jobs[jobID].serial = &mtctx->serial; + mtctx->jobs[jobID].jobID = mtctx->nextJobID; + mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0); + mtctx->jobs[jobID].lastJob = endFrame; + mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0); + mtctx->jobs[jobID].dstFlushed = 0; + + /* Update the round buffer pos and clear the input buffer to be reset */ + mtctx->roundBuff.pos += srcSize; + mtctx->inBuff.buffer = g_nullBuffer; + mtctx->inBuff.filled = 0; + /* Set the prefix */ + if (!endFrame) { + size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize); + mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize; + mtctx->inBuff.prefix.size = newPrefixSize; + } else { /* endFrame==1 => no need for another input buffer */ + mtctx->inBuff.prefix = kNullRange; + mtctx->frameEnded = endFrame; + if (mtctx->nextJobID == 0) { + /* single job exception : checksum is already calculated directly within worker thread */ + mtctx->params.fParams.checksumFlag = 0; + } } + + if ( (srcSize == 0) + && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame"); + assert(endOp == ZSTD_e_end); /* only possible case : need to end the frame with an empty last block */ + ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID); + mtctx->nextJobID++; + return 0; + } + } + + DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes (end:%u, jobNb == %u (mod:%u))", + mtctx->nextJobID, + (U32)mtctx->jobs[jobID].src.size, + mtctx->jobs[jobID].lastJob, + mtctx->nextJobID, + jobID); + if (POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) { + mtctx->nextJobID++; + mtctx->jobReady = 0; + } else { + DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID); + mtctx->jobReady = 1; + } + return 0; +} + + +/*! ZSTDMT_flushProduced() : + * flush whatever data has been produced but not yet flushed in current job. + * move to next job if current one is fully flushed. + * `output` : `pos` will be updated with amount of data flushed . + * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush . 
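+ * `end` : for ZSTD_e_end, the return value reports whether the frame is
+ *         completed, rather than whether internal buffers are fully flushed .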
+ * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */ +static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end) +{ + unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask; + DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)", + blockToFlush, mtctx->doneJobID, mtctx->nextJobID); + assert(output->size >= output->pos); + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + if ( blockToFlush + && (mtctx->doneJobID < mtctx->nextJobID) ) { + assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize); + while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) { /* nothing to flush */ + if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) { + DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size); + break; + } + DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex); /* block when nothing to flush but some to come */ + } } + + /* try to flush something */ + { size_t cSize = mtctx->jobs[wJobID].cSize; /* shared */ + size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */ + size_t const srcSize = mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */ + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + if (ZSTD_isError(cSize)) { + DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s", + mtctx->doneJobID, ZSTD_getErrorName(cSize)); + ZSTDMT_waitForAllJobsCompleted(mtctx); + ZSTDMT_releaseAllJobResources(mtctx); + return cSize; + } + /* add frame checksum if necessary (can only happen once) */ + assert(srcConsumed <= srcSize); + if ( (srcConsumed == srcSize) /* job completed -> worker no longer active */ + && mtctx->jobs[wJobID].frameChecksumNeeded ) { + U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState); + DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum); + MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum); + cSize += 4; + mtctx->jobs[wJobID].cSize += 4; /* can write this shared value, as worker is no longer active */ + mtctx->jobs[wJobID].frameChecksumNeeded = 0; + } + + if (cSize > 0) { /* compression is ongoing or completed */ + size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos); + DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)", + (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize); + assert(mtctx->doneJobID < mtctx->nextJobID); + assert(cSize >= mtctx->jobs[wJobID].dstFlushed); + assert(mtctx->jobs[wJobID].dstBuff.start != NULL); + if (toFlush > 0) { + ZSTD_memcpy((char*)output->dst + output->pos, + (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed, + toFlush); + } + output->pos += toFlush; + mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */ + + if ( (srcConsumed == srcSize) /* job is completed */ + && (mtctx->jobs[wJobID].dstFlushed == cSize) ) { /* output buffer fully flushed => free this job position */ + DEBUGLOG(5, "Job 
%u completed (%u bytes), moving to next one", + mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed); + ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff); + DEBUGLOG(5, "dstBuffer released"); + mtctx->jobs[wJobID].dstBuff = g_nullBuffer; + mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */ + mtctx->consumed += srcSize; + mtctx->produced += cSize; + mtctx->doneJobID++; + } } + + /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */ + if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed); + if (srcSize > srcConsumed) return 1; /* current job not completely compressed */ + } + if (mtctx->doneJobID < mtctx->nextJobID) return 1; /* some more jobs ongoing */ + if (mtctx->jobReady) return 1; /* one job is ready to push, just not yet in the list */ + if (mtctx->inBuff.filled > 0) return 1; /* input is not empty, and still needs to be converted into a job */ + mtctx->allJobsCompleted = mtctx->frameEnded; /* all jobs are entirely flushed => if this one is last one, frame is completed */ + if (end == ZSTD_e_end) return !mtctx->frameEnded; /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */ + return 0; /* internal buffers fully flushed */ +} + +/** + * Returns the range of data used by the earliest job that is not yet complete. + * If the data of the first job is broken up into two segments, we cover both + * sections. + */ +static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) +{ + unsigned const firstJobID = mtctx->doneJobID; + unsigned const lastJobID = mtctx->nextJobID; + unsigned jobID; + + for (jobID = firstJobID; jobID < lastJobID; ++jobID) { + unsigned const wJobID = jobID & mtctx->jobIDMask; + size_t consumed; + + ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex); + consumed = mtctx->jobs[wJobID].consumed; + ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); + + if (consumed < mtctx->jobs[wJobID].src.size) { + range_t range = mtctx->jobs[wJobID].prefix; + if (range.size == 0) { + /* Empty prefix */ + range = mtctx->jobs[wJobID].src; + } + /* Job source in multiple segments not supported yet */ + assert(range.start <= mtctx->jobs[wJobID].src.start); + return range; + } + } + return kNullRange; +} + +/** + * Returns non-zero iff buffer and range overlap. 
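+ * Both are treated as half-open byte intervals : for example, [8,16) and
+ * [12,20) overlap, while [8,16) and [16,24) do not.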
+ */ +static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range) +{ + BYTE const* const bufferStart = (BYTE const*)buffer.start; + BYTE const* const rangeStart = (BYTE const*)range.start; + + if (rangeStart == NULL || bufferStart == NULL) + return 0; + + { + BYTE const* const bufferEnd = bufferStart + buffer.capacity; + BYTE const* const rangeEnd = rangeStart + range.size; + + /* Empty ranges cannot overlap */ + if (bufferStart == bufferEnd || rangeStart == rangeEnd) + return 0; + + return bufferStart < rangeEnd && rangeStart < bufferEnd; + } +} + +static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window) +{ + range_t extDict; + range_t prefix; + + DEBUGLOG(5, "ZSTDMT_doesOverlapWindow"); + extDict.start = window.dictBase + window.lowLimit; + extDict.size = window.dictLimit - window.lowLimit; + + prefix.start = window.base + window.dictLimit; + prefix.size = window.nextSrc - (window.base + window.dictLimit); + DEBUGLOG(5, "extDict [0x%zx, 0x%zx)", + (size_t)extDict.start, + (size_t)extDict.start + extDict.size); + DEBUGLOG(5, "prefix [0x%zx, 0x%zx)", + (size_t)prefix.start, + (size_t)prefix.start + prefix.size); + + return ZSTDMT_isOverlapped(buffer, extDict) + || ZSTDMT_isOverlapped(buffer, prefix); +} + +static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer) +{ + if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex; + DEBUGLOG(5, "ZSTDMT_waitForLdmComplete"); + DEBUGLOG(5, "source [0x%zx, 0x%zx)", + (size_t)buffer.start, + (size_t)buffer.start + buffer.capacity); + ZSTD_PTHREAD_MUTEX_LOCK(mutex); + while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) { + DEBUGLOG(5, "Waiting for LDM to finish..."); + ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex); + } + DEBUGLOG(6, "Done waiting for LDM to finish"); + ZSTD_pthread_mutex_unlock(mutex); + } +} + +/** + * Attempts to set the inBuff to the next section to fill. + * If any part of the new section is still in use we give up. + * Returns non-zero if the buffer is filled. + */ +static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) +{ + range_t const inUse = ZSTDMT_getInputDataInUse(mtctx); + size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos; + size_t const target = mtctx->targetSectionSize; + buffer_t buffer; + + DEBUGLOG(5, "ZSTDMT_tryGetInputRange"); + assert(mtctx->inBuff.buffer.start == NULL); + assert(mtctx->roundBuff.capacity >= target); + + if (spaceLeft < target) { + /* ZSTD_invalidateRepCodes() doesn't work for extDict variants. + * Simply copy the prefix to the beginning in that case. 
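+ * Copying, rather than wrapping around, keeps the next job's prefix and
+ * source contiguous, at the cost of one memmove per lap of the round buffer.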
+         */
+        BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
+        size_t const prefixSize = mtctx->inBuff.prefix.size;
+
+        buffer.start = start;
+        buffer.capacity = prefixSize;
+        if (ZSTDMT_isOverlapped(buffer, inUse)) {
+            DEBUGLOG(5, "Waiting for buffer...");
+            return 0;
+        }
+        ZSTDMT_waitForLdmComplete(mtctx, buffer);
+        ZSTD_memmove(start, mtctx->inBuff.prefix.start, prefixSize);
+        mtctx->inBuff.prefix.start = start;
+        mtctx->roundBuff.pos = prefixSize;
+    }
+    buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
+    buffer.capacity = target;
+
+    if (ZSTDMT_isOverlapped(buffer, inUse)) {
+        DEBUGLOG(5, "Waiting for buffer...");
+        return 0;
+    }
+    assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
+
+    ZSTDMT_waitForLdmComplete(mtctx, buffer);
+
+    DEBUGLOG(5, "Using prefix range [%zx, %zx)",
+                (size_t)mtctx->inBuff.prefix.start,
+                (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
+    DEBUGLOG(5, "Using source range [%zx, %zx)",
+                (size_t)buffer.start,
+                (size_t)buffer.start + buffer.capacity);
+
+
+    mtctx->inBuff.buffer = buffer;
+    mtctx->inBuff.filled = 0;
+    assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
+    return 1;
+}
+
+typedef struct {
+    size_t toLoad;   /* The number of bytes to load from the input. */
+    int flush;       /* Boolean declaring if we must flush because we found a synchronization point. */
+} syncPoint_t;
+
+/**
+ * Searches through the input for a synchronization point. If one is found, we
+ * will instruct the caller to flush, and return the number of bytes to load.
+ * Otherwise, we will load as many bytes as possible and instruct the caller
+ * to continue as normal.
+ */
+static syncPoint_t
+findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
+{
+    BYTE const* const istart = (BYTE const*)input.src + input.pos;
+    U64 const primePower = mtctx->rsync.primePower;
+    U64 const hitMask = mtctx->rsync.hitMask;
+
+    syncPoint_t syncPoint;
+    U64 hash;
+    BYTE const* prev;
+    size_t pos;
+
+    syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
+    syncPoint.flush = 0;
+    if (!mtctx->params.rsyncable)
+        /* Rsync is disabled. */
+        return syncPoint;
+    if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE)
+        /* We don't emit synchronization points if it would produce too small blocks.
+         * We don't have enough input to find a synchronization point, so don't look.
+         */
+        return syncPoint;
+    if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH)
+        /* Not enough to compute the hash.
+         * We will miss any synchronization points in this RSYNC_LENGTH byte
+         * window. However, since it depends only on the internal buffers, if the
+         * state is already synchronized, we will remain synchronized.
+         * Additionally, the probability that we miss a synchronization point is
+         * low: RSYNC_LENGTH / targetSectionSize.
+         */
+        return syncPoint;
+    /* Initialize the loop variables. */
+    if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) {
+        /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions
+         * because they can't possibly be a sync point. So we can start
+         * part way through the input buffer.
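+         * `pos` is then the first position at which a block cut, counting the
+         * mtctx->inBuff.filled bytes already buffered, reaches RSYNC_MIN_BLOCK_SIZE.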
+ */ + pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled; + if (pos >= RSYNC_LENGTH) { + prev = istart + pos - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); + } else { + assert(mtctx->inBuff.filled >= RSYNC_LENGTH); + prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos)); + hash = ZSTD_rollingHash_append(hash, istart, pos); + } + } else { + /* We have enough bytes buffered to initialize the hash, + * and have processed enough bytes to find a sync point. + * Start scanning at the beginning of the input. + */ + assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE); + assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH); + pos = 0; + prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); + if ((hash & hitMask) == hitMask) { + /* We're already at a sync point so don't load any more until + * we're able to flush this sync point. + * This likely happened because the job table was full so we + * couldn't add our job. + */ + syncPoint.toLoad = 0; + syncPoint.flush = 1; + return syncPoint; + } + } + /* Starting with the hash of the previous RSYNC_LENGTH bytes, roll + * through the input. If we hit a synchronization point, then cut the + * job off, and tell the compressor to flush the job. Otherwise, load + * all the bytes and continue as normal. + * If we go too long without a synchronization point (targetSectionSize) + * then a block will be emitted anyways, but this is okay, since if we + * are already synchronized we will remain synchronized. + */ + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + for (; pos < syncPoint.toLoad; ++pos) { + BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH]; + /* This assert is very expensive, and Debian compiles with asserts enabled. + * So disable it for now. We can get similar coverage by checking it at the + * beginning & end of the loop. + * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + */ + hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower); + assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE); + if ((hash & hitMask) == hitMask) { + syncPoint.toLoad = pos + 1; + syncPoint.flush = 1; + ++pos; /* for assert */ + break; + } + } + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + return syncPoint; +} + +size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx) +{ + size_t hintInSize = mtctx->targetSectionSize - mtctx->inBuff.filled; + if (hintInSize==0) hintInSize = mtctx->targetSectionSize; + return hintInSize; +} + +/** ZSTDMT_compressStream_generic() : + * internal use only - exposed to be invoked from zstd_compress.c + * assumption : output and input are valid (pos <= size) + * @return : minimum amount of data remaining to flush, 0 if none */ +size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + unsigned forwardInputProgress = 0; + DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)", + (U32)endOp, (U32)(input->size - input->pos)); + assert(output->pos <= output->size); + assert(input->pos <= input->size); + + if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) { + /* current frame being ended. 
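(a job carrying the last-block flag has already been prepared, so no further input can be accepted for this frame).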
Only flush/end are allowed */ + return ERROR(stage_wrong); + } + + /* fill input buffer */ + if ( (!mtctx->jobReady) + && (input->size > input->pos) ) { /* support NULL input */ + if (mtctx->inBuff.buffer.start == NULL) { + assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */ + if (!ZSTDMT_tryGetInputRange(mtctx)) { + /* It is only possible for this operation to fail if there are + * still compression jobs ongoing. + */ + DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed"); + assert(mtctx->doneJobID != mtctx->nextJobID); + } else + DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start); + } + if (mtctx->inBuff.buffer.start != NULL) { + syncPoint_t const syncPoint = findSynchronizationPoint(mtctx, *input); + if (syncPoint.flush && endOp == ZSTD_e_continue) { + endOp = ZSTD_e_flush; + } + assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize); + DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u", + (U32)syncPoint.toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize); + ZSTD_memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad); + input->pos += syncPoint.toLoad; + mtctx->inBuff.filled += syncPoint.toLoad; + forwardInputProgress = syncPoint.toLoad>0; + } + } + if ((input->pos < input->size) && (endOp == ZSTD_e_end)) { + /* Can't end yet because the input is not fully consumed. + * We are in one of these cases: + * - mtctx->inBuff is NULL & empty: we couldn't get an input buffer so don't create a new job. + * - We filled the input buffer: flush this job but don't end the frame. + * - We hit a synchronization point: flush this job but don't end the frame. + */ + assert(mtctx->inBuff.filled == 0 || mtctx->inBuff.filled == mtctx->targetSectionSize || mtctx->params.rsyncable); + endOp = ZSTD_e_flush; + } + + if ( (mtctx->jobReady) + || (mtctx->inBuff.filled >= mtctx->targetSectionSize) /* filled enough : let's compress */ + || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0)) /* something to flush : let's go */ + || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) { /* must finish the frame with a zero-size block */ + size_t const jobSize = mtctx->inBuff.filled; + assert(mtctx->inBuff.filled <= mtctx->targetSectionSize); + FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) , ""); + } + + /* check for potential compressed data ready to be flushed */ + { size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */ + if (input->pos < input->size) return MAX(remainingToFlush, 1); /* input not consumed : do not end flush yet */ + DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush); + return remainingToFlush; + } +} diff --git a/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.h b/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.h new file mode 100644 index 000000000..271eb1ac7 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/compress/zstdmt_compress.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + */ + + #ifndef ZSTDMT_COMPRESS_H + #define ZSTDMT_COMPRESS_H + + #if defined (__cplusplus) + extern "C" { + #endif + + +/* Note : This is an internal API. + * These APIs used to be exposed with ZSTDLIB_API, + * because it used to be the only way to invoke MT compression. + * Now, you must use ZSTD_compress2 and ZSTD_compressStream2() instead. + * + * This API requires ZSTD_MULTITHREAD to be defined during compilation, + * otherwise ZSTDMT_createCCtx*() will fail. + */ + +/* === Dependencies === */ +#include "../common/zstd_deps.h" /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ +#include "../zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ + + +/* === Constants === */ +#ifndef ZSTDMT_NBWORKERS_MAX /* a different value can be selected at compile time */ +# define ZSTDMT_NBWORKERS_MAX ((sizeof(void*)==4) /*32-bit*/ ? 64 : 256) +#endif +#ifndef ZSTDMT_JOBSIZE_MIN /* a different value can be selected at compile time */ +# define ZSTDMT_JOBSIZE_MIN (512 KB) +#endif +#define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30) +#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB)) + + +/* ======================================================== + * === Private interface, for use by ZSTD_compress.c === + * === Not exposed in libzstd. Never invoke directly === + * ======================================================== */ + +/* === Memory management === */ +typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx; +/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */ +ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, + ZSTD_customMem cMem, + ZSTD_threadPool *pool); +size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); + +size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); + +/* === Streaming functions === */ + +size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx); + +/*! ZSTDMT_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * mtctx can be freshly constructed or reused from a prior compression. + * If mtctx is reused, memory allocations from the prior compression may not be freed, + * even if they are not needed for the current compression. + * @return : 0, or an error code */ +size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); + +/*! ZSTDMT_compressStream_generic() : + * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() + * depending on flush directive. + * @return : minimum amount of data still to be flushed + * 0 if fully flushed + * or an error code + * note : needs to be init using any ZSTD_initCStream*() variant */ +size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + /*! ZSTDMT_toFlushNow() + * Tell how many bytes are ready to be flushed immediately. + * Probe the oldest active job (not yet entirely flushed) and check its output buffer. + * If return 0, it means there is no active job, + * or, it means oldest job is still active, but everything produced has been flushed so far, + * therefore flushing is limited by speed of oldest job. */ +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx); + +/*! 
ZSTDMT_updateCParams_whileCompressing() : + * Updates only a selected set of compression parameters, to remain compatible with current frame. + * New parameters will be applied to next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams); + +/*! ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDMT_COMPRESS_H */ diff --git a/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress.c b/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress.c new file mode 100644 index 000000000..c6fd92860 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress.c @@ -0,0 +1,1891 @@ +/* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library + * Copyright (c) Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Dependencies +****************************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ +#include "../common/compiler.h" +#include "../common/bitstream.h" /* BIT_* */ +#include "../common/fse.h" /* to compress headers */ +#define HUF_STATIC_LINKING_ONLY +#include "../common/huf.h" +#include "../common/error_private.h" +#include "../common/zstd_internal.h" +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 + +/* ************************************************************** +* Macros +****************************************************************/ + +/* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. + */ +#if defined(HUF_FORCE_DECOMPRESS_X1) && \ + defined(HUF_FORCE_DECOMPRESS_X2) +#error "Cannot force the use of the X1 and X2 decoders at the same time!" 
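(Editorial aside, not part of the patch: the #error directive above is the usual way to make two build-time force-flags mutually exclusive. A minimal standalone sketch of the same guard pattern, using hypothetical DEMO_ macro names rather than the HUF_ ones:)

    /* demo_guard.c -- DEMO_FORCE_X1 / DEMO_FORCE_X2 are hypothetical
     * stand-ins for HUF_FORCE_DECOMPRESS_X1 / HUF_FORCE_DECOMPRESS_X2. */
    #include <stdio.h>

    #if defined(DEMO_FORCE_X1) && defined(DEMO_FORCE_X2)
    #error "DEMO_FORCE_X1 and DEMO_FORCE_X2 are mutually exclusive"
    #endif

    int main(void)
    {
    #if defined(DEMO_FORCE_X1)
        puts("single-symbol (X1) decoder forced");
    #elif defined(DEMO_FORCE_X2)
        puts("double-symbol (X2) decoder forced");
    #else
        puts("decoder variant chosen at run time");
    #endif
        return 0;
    }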
+#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError + + +/* ************************************************************** +* Byte alignment for workSpace management +****************************************************************/ +#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) +#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + + +/* ************************************************************** +* BMI2 Variant Wrappers +****************************************************************/ +#if DYNAMIC_BMI2 + +#define HUF_DGEN(fn) \ + \ + static size_t fn##_default( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + if (bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#else + +#define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + (void)bmi2; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#endif + + +/*-***************************/ +/* generic DTableDesc */ +/*-***************************/ +typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; + +static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) +{ + DTableDesc dtd; + ZSTD_memcpy(&dtd, table, sizeof(dtd)); + return dtd; +} + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/** + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. 
+ */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + + BYTE* const oend = (BYTE*)dst + dstSize; + + /* The following condition is false on x32 platform, + * but HUF_asm is not compatible with this ABI */ + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initDStream(args->ip[0]); + args->bits[1] = HUF_initDStream(args->ip[1]); + args->bits[2] = HUF_initDStream(args->ip[2]); + args->bits[3] = HUF_initDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>= istart). + */ + args->ilimit = ilimit; + + args->oend = oend; + args->dt = dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. 
+ */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ + assert(sizeof(size_t) == 8); + bit->bitContainer = MEM_readLE64(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} +#endif + + +#ifndef HUF_FORCE_DECOMPRESS_X2 + +/*-***************************/ +/* single-symbol decoding */ +/*-***************************/ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ + +/** + * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at + * a time. + */ +static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + D4 = (symbol << 8) + nbBits; + } else { + D4 = symbol + (nbBits << 8); + } + D4 *= 0x0001000100010001ULL; + return D4; +} + +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + +typedef struct { + U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; +} HUF_ReadDTableX1_Workspace; + + +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +{ + return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t iSize; + void* const dtPtr = DTable + 1; + HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; + + DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); + if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); + + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ + + iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); + if (HUF_isError(iSize)) return iSize; + + + /* Table header */ + { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); + if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Compute symbols and rankStart given rankVal: + * + * rankVal already contains the number of values of each weight. + * + * symbols contains the symbols ordered by weight. First are the rankVal[0] + * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. + * symbols[0] is filled (but unused) to avoid a branch. + * + * rankStart contains the offset where each rank belongs in the DTable. + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ + { + int n; + int nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { + U32 const curr = nextRankStart; + nextRankStart += wksp->rankVal[n]; + wksp->rankStart[n] = curr; + } + for (n=0; n < nLimit; n += unroll) { + int u; + for (u=0; u < unroll; ++u) { + size_t const w = wksp->huffWeight[n+u]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u); + } + } + for (; n < (int)nbSymbols; ++n) { + size_t const w = wksp->huffWeight[n]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)n; + } + } + + /* fill DTable + * We fill all entries of each weight in order. + * That way length is a constant for each iteration of the outer loop. + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
+ */
+    {
+        U32 w;
+        int symbol = wksp->rankVal[0];
+        int rankStart = 0;
+        for (w=1; w<tableLog+1; ++w) {
+            int const symbolCount = wksp->rankVal[w];
+            int const length = (1 << w) >> 1;
+            int uStart = rankStart;
+            BYTE const nbBits = (BYTE)(tableLog + 1 - w);
+            int s;
+            int u;
+            switch (length) {
+            case 1:
+                for (s=0; s<symbolCount; ++s) {
+                    HUF_DEltX1 D;
+                    D.byte = wksp->symbols[symbol + s];
+                    D.nbBits = nbBits;
+                    dt[uStart] = D;
+                    uStart += 1;
+                }
+                break;
+            case 2:
+                for (s=0; s<symbolCount; ++s) {
+                    HUF_DEltX1 D;
+                    D.byte = wksp->symbols[symbol + s];
+                    D.nbBits = nbBits;
+                    dt[uStart+0] = D;
+                    dt[uStart+1] = D;
+                    uStart += 2;
+                }
+                break;
+            case 4:
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    MEM_write64(dt + uStart, D4);
+                    uStart += 4;
+                }
+                break;
+            case 8:
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    MEM_write64(dt + uStart, D4);
+                    MEM_write64(dt + uStart + 4, D4);
+                    uStart += 8;
+                }
+                break;
+            default:
+                for (s=0; s<symbolCount; ++s) {
+                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
+                    for (u=0; u < length; u += 16) {
+                        MEM_write64(dt + uStart + u + 0, D4);
+                        MEM_write64(dt + uStart + u + 4, D4);
+                        MEM_write64(dt + uStart + u + 8, D4);
+                        MEM_write64(dt + uStart + u + 12, D4);
+                    }
+                    assert(u == length);
+                    uStart += length;
+                }
+                break;
+            }
+            symbol += symbolCount;
+            rankStart += symbolCount * length;
+        }
+    }
+    return iSize;
+}
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(Dstream, dtLog);    /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BIT_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+
+HINT_INLINE size_t
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    if ((pEnd - p) > 3) {
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+        }
+    } else {
+        BIT_reloadDStream(bitDPtr);
+    }
+
+    /* [0-3] symbols remaining */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X1_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const void* dtPtr = DTable + 1;
+    const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+    BIT_DStream_t bitD;
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);
+
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X1_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream
*/ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - 3; + const void* const dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + U32 endSignal = 1; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } + } + + /* check corruption */ + /* note : should not be necessary : op# advance in lock step, and we control op4. 
+ * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. 
*/ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} + + +size_t HUF_decompress1X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress4X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +} + + +#endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +#ifndef HUF_FORCE_DECOMPRESS_X1 + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ + +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ +typedef struct { BYTE symbol; } sortedSymbol_t; +typedef U32 
rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; + +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} + +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + HUF_DEltX2 DElt; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} + +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. 
+ */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} + +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ + if (minWeight>1) { + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } + } + + /* Fill each of the second level symbols by weight. 
*/ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } +} + +static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, + const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) +{ + U32* const rankVal = rankValOrigin[0]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + int w; + int const wEnd = (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); + int minWeight = nbBits + scaleLog; + int s; + if (minWeight < 1) minWeight = 1; + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } + } else { + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } + } +} + +typedef struct { + rankValCol_t rankVal[HUF_TABLELOG_MAX]; + U32 rankStats[HUF_TABLELOG_MAX + 1]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; + sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; + BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; + U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +} HUF_ReadDTableX2_Workspace; + +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 maxTableLog = dtd.maxTableLog; + size_t iSize; + void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + U32 *rankStart; + + HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; + + if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); + + rankStart = wksp->rankStart0 + 1; + ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); + ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); + + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/
+
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+    if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
+
+    /* find maxWeight */
+    for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    { U32 w, nextRankStart = 0;
+      for (w=1; w<maxW+1; w++) {
+          U32 curr = nextRankStart;
+          nextRankStart += wksp->rankStats[w];
+          rankStart[w] = curr;
+      }
+      rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+      rankStart[maxW+1] = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    { U32 s;
+      for (s=0; s<nbSymbols; s++) {
+          U32 const w = wksp->weightList[s];
+          U32 const r = rankStart[w]++;
+          wksp->sortedSymbol[r].symbol = (BYTE)s;
+      }
+      rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    { U32* const rankVal0 = wksp->rankVal[0];
+      { int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+        U32 nextRankVal = 0;
+        U32 w;
+        for (w=1; w<maxW+1; w++) {
+            U32 curr = nextRankVal;
+            nextRankVal += wksp->rankStats[w] << (w+rescale);
+            rankVal0[w] = curr;
+      } }
+      { U32 const minBits = tableLog+1 - maxW;
+        U32 consumed;
+        for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+            U32* const rankValPtr = wksp->rankVal[consumed];
+            U32 w;
+            for (w = 1; w < maxW+1; w++) {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+    } } } }
+
+    HUF_fillDTableX2(dt, maxTableLog,
+                     wksp->sortedSymbol,
+                     wksp->rankStart0, wksp->rankVal, maxW,
+                     tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    ZSTD_memcpy(op, &dt[val].sequence, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    ZSTD_memcpy(op, &dt[val].sequence, 1);
+    if (dt[val].length==1) {
+        BIT_skipBits(DStream, dt[val].nbBits);
+    } else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol.
Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } + } + return 1; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +HINT_INLINE size_t +HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + if ((size_t)(pEnd - p) >= 2) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + } + + if (p < pEnd) + p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - (sizeof(size_t)-1); + const void* const dtPtr = DTable+1; + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + 
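(Editorial aside, not part of the patch: the function body above this point has just parsed the 6-byte jump table that opens a 4-stream huff0 block: the first three stream sizes are stored as little-endian 16-bit values and the fourth is whatever remains. A hedged standalone sketch of that parse, with hypothetical names:)

    #include <stddef.h>
    #include <stdint.h>

    /* Split a 4-stream huff0 block into its four stream lengths.
     * Mirrors the length1..length4 computation in the decoders above.
     * Returns 0 on success, -1 on an inconsistent header. */
    static int demo_read_jump_table(const uint8_t* src, size_t srcSize,
                                    size_t len[4])
    {
        size_t sum;
        if (srcSize < 10) return -1;    /* jump table + 1 byte per stream */
        len[0] = (size_t)src[0] | ((size_t)src[1] << 8);    /* LE16 */
        len[1] = (size_t)src[2] | ((size_t)src[3] << 8);
        len[2] = (size_t)src[4] | ((size_t)src[5] << 8);
        sum = 6 + len[0] + len[1] + len[2];
        if (sum > srcSize) return -1;   /* lengths overflow the block */
        len[3] = srcSize - sum;         /* fourth length is implied */
        return 0;
    }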
const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal = 1; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +#else + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +#endif + } + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = 
BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} + +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +size_t HUF_decompress1X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); 
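(Editorial aside, not part of the patch: the _wksp entry points defined just below take a caller-owned DTable and scratch buffer, so no allocation happens inside the decoder. A hedged usage sketch composed from declarations in this file, assuming the subset's static-linking-only huf.h is on the include path; it mirrors the HUF_decompress1X2 composition in the unused-functions section at the end of the file:)

    #define HUF_STATIC_LINKING_ONLY
    #include "../common/huf.h"

    /* One-shot decode of a single huff0 stream into dst, using only
     * stack storage. Returns the decoded size, or a HUF error code. */
    static size_t demo_decode_1x2(void* dst, size_t dstSize,
                                  const void* cSrc, size_t cSrcSize)
    {
        HUF_CREATE_STATIC_DTABLEX2(dtable, HUF_TABLELOG_MAX);
        U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
        return HUF_decompress1X2_DCtx_wksp(dtable, dst, dstSize,
                                           cSrc, cSrcSize,
                                           workSpace, sizeof(workSpace));
    }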
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress4X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + + +#endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +/* ***********************************/ +/* Universal decompression selectors */ +/* ***********************************/ + +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + + +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) +typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = +{ + /* single, double, quad */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ +}; +#endif + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . + * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) +{ + assert(dstSize > 0); + assert(dstSize <= 128*1024); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dstSize; + (void)cSrcSize; + return 0; +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dstSize; + (void)cSrcSize; + return 1; +#else + /* decoder timing evaluation */ + { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; + } +#endif +} + + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, + size_t dstSize, const void* cSrc, + size_t cSrcSize, void* workSpace, + size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#else + return algoNb ? 
HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#endif + } +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#else + return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#endif + } +} + + +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} +#endif + +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#else + return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : + HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#endif + } +} + +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX1_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); +} +#endif + +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX2_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} +#endif + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} +#endif + +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + 
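+                                       /* on-stack scratch declared above; the
+                                        * _wksp entry points exist so callers
+                                        * can pass their own buffer instead */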
workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} +#endif + +typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); + +size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 }; +#endif + + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize); +#else + return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); +#endif + } +} + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#else + return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : + HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; +#endif + } +} + +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} +#endif diff --git a/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress_amd64.S b/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress_amd64.S new file mode 100644 index 000000000..3f0e5c26c --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/huf_decompress_amd64.S @@ -0,0 +1,574 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + */ + +#include "../common/portability_macros.h" + +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +/* Calling convention: + * + * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + * No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + * + * TODO: Support Windows calling convention. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.text + +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. + */ + +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi + +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 + +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 + +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 + +/* 32-bit var registers */ +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. 
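+  * The hot loop below maps nearly every general-purpose register (see the
+  * op/ip/bits/dtable #defines above), so pushing everything keeps the
+  * entry and exit paths trivial for a negligible one-time cost.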
*/ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + /* Read HUF_DecompressAsmArgs* args from %rax */ + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push 104(%rax) /* ilimit */ + push 112(%rax) /* oend */ + push %olimit /* olimit space */ + + subq $24, %rsp + +.L_4X1_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rbx, rdx must be saved + * op3 & ip0 mustn't be clobbered + */ + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) + + movq 32(%rsp), %rax /* rax = oend */ + subq %op3, %rax /* rax = oend - op3 */ + + /* r15 = (oend - op3) / 5 */ + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 + + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %rbx /* rbx = ip0 - ilimit */ + + /* rdx = (ip0 - ilimit) / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* r15 = r15 * 5 */ + leaq (%r15, %r15, 4), %r15 + + /* olimit = op3 + r15 */ + addq %op3, %olimit + + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx + + /* If (op3 + 20 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $20, %rax /* rax = op3 + 20 */ + cmpq %rax, %olimit /* op3 + 20 > olimit */ + jb .L_4X1_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X1_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X1_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X1_exit + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) + +/* Assumes GET_NEXT_DELT has been called. 
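+ * Interleaving the next table fetch with the current symbol's shift/store
+ * lets the DTable load overlap useful work across the four streams. Each
+ * iteration emits 5 bytes per stream and may consume up to 7 input bytes,
+ * which is what the divisions by 5 and by 7 in compute_olimit account for.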
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n + + /* Store clobbered variables on the stack */ + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) + + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + + /* Load ip[1,2,3] from stack (var[] aliases them) + * ip[] is needed for RELOAD_BITS + * Each will be stored back to the stack after RELOAD + */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) + + /* If op3 < olimit: continue the loop */ + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body + + /* Reload ip[1,2,3] from stack */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Re-compute olimit */ + jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: + addq $24, %rsp + + /* Restore stack (oend & olimit) */ + pop %rax /* olimit */ + pop %rax /* oend */ + pop %rax /* ilimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. 
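+  * Same save-everything policy as the 4X1 loop above. Note the stack layout
+  * built below: each stream's output limit is the next stream's start
+  * (oend0..oend2 come from op1..op3) and the real oend closes stream 3.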
*/ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push %rax /* olimit */ + push 104(%rax) /* ilimit */ + + movq 112(%rax), %rax + push %rax /* oend3 */ + + movq %op3, %rax + push %rax /* oend2 */ + + movq %op2, %rax + push %rax /* oend1 */ + + movq %op1, %rax + push %rax /* oend0 */ + + /* Scratch space */ + subq $8, %rsp + +.L_4X2_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rdx must be saved + * op[1,2,3,4] & ip0 mustn't be clobbered + */ + movq %rdx, 0(%rsp) + + /* We can consume up to 7 input bytes each iteration. */ + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %r15 /* r15 = ip0 - ilimit */ + + /* rdx = rax / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx + + /* r15 = (ip0 - ilimit) / 7 */ + movq %rdx, %r15 + + /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ + movq 8(%rsp), %rax /* rax = oend0 */ + subq %op0, %rax /* rax = oend0 - op0 */ + movq 16(%rsp), %rdx /* rdx = oend1 */ + subq %op1, %rdx /* rdx = oend1 - op1 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movq 24(%rsp), %rax /* rax = oend2 */ + subq %op2, %rax /* rax = oend2 - op2 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movq 32(%rsp), %rax /* rax = oend3 */ + subq %op3, %rax /* rax = oend3 - op3 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movabsq $-3689348814741910323, %rax + mulq %rdx + shrq $3, %rdx /* rdx = rdx / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* olimit = op3 + 5 * r15 */ + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit + + movq 0(%rsp), %rdx + + /* If (op3 + 10 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $10, %rax /* rax = op3 + 10 */ + cmpq %rax, %olimit /* op3 + 10 > olimit */ + jb .L_4X2_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X2_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X2_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X2_exit + +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n + +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n + + + movq %olimit, 48(%rsp) + + .p2align 6 + +.L_4X2_loop_body: + /* We clobber r8, so store it on the stack */ + movq %r8, 0(%rsp) + + /* Decode 5 symbols from each of the 4 streams (20 symbols total). 
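+     * Each DECODE stores a 2-byte table entry with movw but advances op##n
+     * by the entry's actual length (1 or 2 bytes), so five unrolled decodes
+     * may emit up to 10 bytes per stream, which is why compute_olimit above
+     * divides the remaining output room by 10.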
*/ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + /* Reload r8 */ + movq 0(%rsp), %r8 + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: + addq $8, %rsp + /* Restore stack (oend & olimit) */ + pop %rax /* oend0 */ + pop %rax /* oend1 */ + pop %rax /* oend2 */ + pop %rax /* oend3 */ + pop %rax /* ilimit */ + pop %rax /* olimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +#endif diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.c b/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.c new file mode 100644 index 000000000..889764a5e --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* zstd_ddict.c : + * concentrates all logic that needs to know the internals of ZSTD_DDict object */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "../common/cpu.h" /* bmi2 */ +#include "../common/mem.h" /* low level memory routines */ +#define FSE_STATIC_LINKING_ONLY +#include "../common/fse.h" +#define HUF_STATIC_LINKING_ONLY +#include "../common/huf.h" +#include "zstd_decompress_internal.h" +#include "zstd_ddict.h" + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +# include "../legacy/zstd_legacy.h" +#endif + + + +/*-******************************************************* +* Types +*********************************************************/ +struct ZSTD_DDict_s { + void* dictBuffer; + const void* dictContent; + size_t dictSize; + ZSTD_entropyDTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; +}; /* typedef'd to ZSTD_DDict within "zstd.h" */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictContent; +} + +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictSize; +} + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_copyDDictParameters"); + assert(dctx != NULL); + assert(ddict != NULL); + dctx->dictID = ddict->dictID; + dctx->prefixStart = ddict->dictContent; + dctx->virtualStart = ddict->dictContent; + dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; + dctx->previousDstEnd = dctx->dictEnd; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + 
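+    /* fuzzing-only bookkeeping: remember the dictionary span so checks
+     * compiled in fuzzing mode can validate references against it */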
dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + if (ddict->entropyPresent) { + dctx->litEntropy = 1; + dctx->fseEntropy = 1; + dctx->LLTptr = ddict->entropy.LLTable; + dctx->MLTptr = ddict->entropy.MLTable; + dctx->OFTptr = ddict->entropy.OFTable; + dctx->HUFptr = ddict->entropy.hufTable; + dctx->entropy.rep[0] = ddict->entropy.rep[0]; + dctx->entropy.rep[1] = ddict->entropy.rep[1]; + dctx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dctx->litEntropy = 0; + dctx->fseEntropy = 0; + } +} + + +static size_t +ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, + ZSTD_dictContentType_e dictContentType) +{ + ddict->dictID = 0; + ddict->entropyPresent = 0; + if (dictContentType == ZSTD_dct_rawContent) return 0; + + if (ddict->dictSize < 8) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + { U32 const magic = MEM_readLE32(ddict->dictContent); + if (magic != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + } + ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( + &ddict->entropy, ddict->dictContent, ddict->dictSize)), + dictionary_corrupted, ""); + ddict->entropyPresent = 1; + return 0; +} + + +static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + if (!dict) dictSize = 0; + } else { + void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + if (!internalBuffer) return ERROR(memory_allocation); + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); + + return 0; +} + +ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem) +{ + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + + { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem); + if (ddict == NULL) return NULL; + ddict->cMem = customMem; + { size_t const initResult = ZSTD_initDDict_internal(ddict, + dict, dictSize, + dictLoadMethod, dictContentType); + if (ZSTD_isError(initResult)) { + ZSTD_freeDDict(ddict); + return NULL; + } } + return ddict; + } +} + +/*! ZSTD_createDDict() : +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is copied inside DDict. +* Consequently, `dict` can be released after `ZSTD_DDict` creation */ +ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); +} + +/*! 
ZSTD_createDDict_byReference() : + * Create a digested dictionary, to start decompression without startup delay. + * Dictionary content is simply referenced, it will be accessed during decompression. + * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ +ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); +} + + +const ZSTD_DDict* ZSTD_initStaticDDict( + void* sBuffer, size_t sBufferSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + size_t const neededSpace = sizeof(ZSTD_DDict) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); + ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; + assert(sBuffer != NULL); + assert(dict != NULL); + if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ + if (sBufferSize < neededSpace) return NULL; + if (dictLoadMethod == ZSTD_dlm_byCopy) { + ZSTD_memcpy(ddict+1, dict, dictSize); /* local copy */ + dict = ddict+1; + } + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, + dict, dictSize, + ZSTD_dlm_byRef, dictContentType) )) + return NULL; + return ddict; +} + + +size_t ZSTD_freeDDict(ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = ddict->cMem; + ZSTD_customFree(ddict->dictBuffer, cMem); + ZSTD_customFree(ddict, cMem); + return 0; + } +} + +/*! ZSTD_estimateDDictSize() : + * Estimate amount of memory that will be needed to create a dictionary for decompression. + * Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */ +size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) +{ + return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); +} + +size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support sizeof on NULL */ + return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; +} + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); +} diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.h b/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.h new file mode 100644 index 000000000..bd03268b5 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_ddict.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + +#ifndef ZSTD_DDICT_H +#define ZSTD_DDICT_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include "../common/zstd_deps.h" /* size_t */ +#include "../zstd.h" /* ZSTD_DDict, and several public functions */ + + +/*-******************************************************* + * Interface + *********************************************************/ + +/* note: several prototypes are already published in `zstd.h` : + * ZSTD_createDDict() + * ZSTD_createDDict_byReference() + * ZSTD_createDDict_advanced() + * ZSTD_freeDDict() + * ZSTD_initStaticDDict() + * ZSTD_sizeof_DDict() + * ZSTD_estimateDDictSize() + * ZSTD_getDictID_fromDict() + */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict); +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict); + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + + + +#endif /* ZSTD_DDICT_H */ diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress.c b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress.c new file mode 100644 index 000000000..5bd412df4 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress.c @@ -0,0 +1,2259 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * HEAPMODE : + * Select how default decompression function ZSTD_decompress() allocates its context, + * on stack (0), or into heap (1, default; requires malloc()). + * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected. + */ +#ifndef ZSTD_HEAPMODE +# define ZSTD_HEAPMODE 1 +#endif + +/*! +* LEGACY_SUPPORT : +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) +*/ +#ifndef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 0 +#endif + +/*! + * MAXWINDOWSIZE_DEFAULT : + * maximum window size accepted by DStream __by default__. + * Frames requiring more memory will be rejected. + * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize(). + */ +#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT +# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1) +#endif + +/*! + * NO_FORWARD_PROGRESS_MAX : + * maximum allowed nb of calls to ZSTD_decompressStream() + * without any forward progress + * (defined as: no byte read from input, and no byte flushed to output) + * before triggering an error. 
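+ * For example, with the default of 16, a caller that keeps invoking
+ * ZSTD_decompressStream() with a full output buffer and no fresh input
+ * gets an error after 16 fruitless calls instead of spinning forever.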
+ */ +#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX +# define ZSTD_NO_FORWARD_PROGRESS_MAX 16 +#endif + + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "../common/mem.h" /* low level memory routines */ +#define FSE_STATIC_LINKING_ONLY +#include "../common/fse.h" +#define HUF_STATIC_LINKING_ONLY +#include "../common/huf.h" +#include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ +#include "../common/zstd_internal.h" /* blockProperties_t */ +#include "zstd_decompress_internal.h" /* ZSTD_DCtx */ +#include "zstd_ddict.h" /* ZSTD_DDictDictContent */ +#include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +#include "../common/bits.h" /* ZSTD_highbit32 */ + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +# include "../legacy/zstd_legacy.h" +#endif + + + +/************************************* + * Multiple DDicts Hashset internals * + *************************************/ + +#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. + */ + +#define DDICT_HASHSET_TABLE_BASE_SIZE 64 +#define DDICT_HASHSET_RESIZE_FACTOR 2 + +/* Hash function to determine starting position of dict insertion within the table + * Returns an index between [0, hashSet->ddictPtrTableSize] + */ +static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) { + const U64 hash = XXH64(&dictID, sizeof(U32), 0); + /* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */ + return hash & (hashSet->ddictPtrTableSize - 1); +} + +/* Adds DDict to a hashset without resizing it. + * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set. + * Returns 0 if successful, or a zstd error code if something went wrong. + */ +static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) { + const U32 dictID = ZSTD_getDictID_fromDDict(ddict); + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!"); + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + while (hashSet->ddictPtrTable[idx] != NULL) { + /* Replace existing ddict if inserting ddict with same dictID */ + if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) { + DEBUGLOG(4, "DictID already exists, replacing rather than adding"); + hashSet->ddictPtrTable[idx] = ddict; + return 0; + } + idx &= idxRangeMask; + idx++; + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + hashSet->ddictPtrTable[idx] = ddict; + hashSet->ddictPtrCount++; + return 0; +} + +/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and + * rehashes all values, allocates new table, frees old table. + * Returns 0 on success, otherwise a zstd error code. 
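+ * The table size stays a power of two (base 64, doubled on each expand),
+ * which is what allows ZSTD_DDictHashSet_getIndex() to reduce the XXH64
+ * hash with a plain mask of (tableSize - 1).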
+ */ +static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR; + const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem); + const ZSTD_DDict** oldTable = hashSet->ddictPtrTable; + size_t oldTableSize = hashSet->ddictPtrTableSize; + size_t i; + + DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize); + RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!"); + hashSet->ddictPtrTable = newTable; + hashSet->ddictPtrTableSize = newTableSize; + hashSet->ddictPtrCount = 0; + for (i = 0; i < oldTableSize; ++i) { + if (oldTable[i] != NULL) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), ""); + } + } + ZSTD_customFree((void*)oldTable, customMem); + DEBUGLOG(4, "Finished re-hash"); + return 0; +} + +/* Fetches a DDict with the given dictID + * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL. + */ +static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) { + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + for (;;) { + size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]); + if (currDictID == dictID || currDictID == 0) { + /* currDictID == 0 implies a NULL ddict entry */ + break; + } else { + idx &= idxRangeMask; /* Goes to start of table when we reach the end */ + idx++; + } + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + return hashSet->ddictPtrTable[idx]; +} + +/* Allocates space for and returns a ddict hash set + * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with. + * Returns NULL if allocation failed. + */ +static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { + ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); + DEBUGLOG(4, "Allocating new hash set"); + if (!ret) + return NULL; + ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); + if (!ret->ddictPtrTable) { + ZSTD_customFree(ret, customMem); + return NULL; + } + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; + return ret; +} + +/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself. + * Note: The ZSTD_DDict* within the table are NOT freed. + */ +static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + DEBUGLOG(4, "Freeing ddict hash set"); + if (hashSet && hashSet->ddictPtrTable) { + ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem); + } + if (hashSet) { + ZSTD_customFree(hashSet, customMem); + } +} + +/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set. + * Returns 0 on success, or a ZSTD error. 
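+ * An add that triggers expansion re-emplaces every live entry into the
+ * doubled table, so a single call can be O(n), but insertion remains O(1)
+ * amortized.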
+ */ +static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) { + DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize); + if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), ""); + } + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), ""); + return 0; +} + +/*-************************************************************* +* Context management +***************************************************************/ +size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support sizeof NULL */ + return sizeof(*dctx) + + ZSTD_sizeof_DDict(dctx->ddictLocal) + + dctx->inBuffSize + dctx->outBuffSize; +} + +size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); } + + +static size_t ZSTD_startingInputLength(ZSTD_format_e format) +{ + size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format); + /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ + assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) ); + return startingInputLength; +} + +static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) +{ + assert(dctx->streamStage == zdss_init); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; +} + +static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +{ + dctx->staticSize = 0; + dctx->ddict = NULL; + dctx->ddictLocal = NULL; + dctx->dictEnd = NULL; + dctx->ddictIsCold = 0; + dctx->dictUses = ZSTD_dont_use; + dctx->inBuff = NULL; + dctx->inBuffSize = 0; + dctx->outBuffSize = 0; + dctx->streamStage = zdss_init; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; +#endif + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; +#if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif + dctx->ddictSet = NULL; + ZSTD_DCtx_resetParameters(dctx); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentEndForFuzzing = NULL; +#endif +} + +ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */ + + ZSTD_initDCtx_internal(dctx); + dctx->staticSize = workspaceSize; + dctx->inBuff = (char*)(dctx+1); + return dctx; +} + +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); + if (!dctx) return NULL; + dctx->customMem = customMem; + ZSTD_initDCtx_internal(dctx); + return dctx; + } +} + +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + +ZSTD_DCtx* ZSTD_createDCtx(void) +{ + DEBUGLOG(3, "ZSTD_createDCtx"); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); +} + +static void ZSTD_clearDict(ZSTD_DCtx* dctx) +{ + ZSTD_freeDDict(dctx->ddictLocal); + dctx->ddictLocal = NULL; + dctx->ddict = NULL; + 
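+    /* both the owned copy (ddictLocal) and the borrowed reference (ddict)
+     * are gone now; also reset the usage policy for the next frame */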
dctx->dictUses = ZSTD_dont_use; +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx"); + { ZSTD_customMem const cMem = dctx->customMem; + ZSTD_clearDict(dctx); + ZSTD_customFree(dctx->inBuff, cMem); + dctx->inBuff = NULL; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->legacyContext) + ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); +#endif + if (dctx->ddictSet) { + ZSTD_freeDDictHashSet(dctx->ddictSet, cMem); + dctx->ddictSet = NULL; + } + ZSTD_customFree(dctx, cMem); + return 0; + } +} + +/* no longer useful */ +void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) +{ + size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); + ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ +} + +/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on + * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then + * accordingly sets the ddict to be used to decompress the frame. + * + * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is. + * + * ZSTD_d_refMultipleDDicts must be enabled for this function to be called. + */ +static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) { + assert(dctx->refMultipleDDicts && dctx->ddictSet); + DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame"); + if (dctx->ddict) { + const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID); + if (frameDDict) { + DEBUGLOG(4, "DDict found!"); + ZSTD_clearDict(dctx); + dctx->dictID = dctx->fParams.dictID; + dctx->ddict = frameDDict; + dctx->dictUses = ZSTD_use_indefinitely; + } + } +} + + +/*-************************************************************* + * Frame header decoding + ***************************************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) return 1; + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(buffer, size)) return 1; +#endif + return 0; +} + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + +/** ZSTD_frameHeaderSize_internal() : + * srcSize must be large enough to reach header size fields. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. 
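+ * For ZSTD_f_zstd1 the complete header spans 6..18 bytes: 4-byte magic,
+ * 1-byte frame header descriptor, then optional window descriptor (0-1),
+ * dictID (0-4) and frame content size (0-8) fields.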
+ * @return : size of the Frame Header + * or an error code, which can be tested with ZSTD_isError() */ +static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) +{ + size_t const minInputSize = ZSTD_startingInputLength(format); + RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, ""); + + { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; + U32 const dictID= fhd & 3; + U32 const singleSegment = (fhd >> 5) & 1; + U32 const fcsId = fhd >> 6; + return minInputSize + !singleSegment + + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + + (singleSegment && !fcsId); + } +} + +/** ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_frameHeaderSize_prefix. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +{ + return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameHeader_advanced() : + * decode Frame Header, or require larger `srcSize`. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +** or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +{ + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + + DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); + + if (srcSize > 0) { + /* note : technically could be considered an assert(), since it's an invalid entry */ + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); + } + if (srcSize < minInputSize) { + if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { + /* when receiving less than @minInputSize bytes, + * control these bytes at least correspond to a supported magic number + * in order to error out early if they don't. 
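+             * The trick below: write the expected magic into a small local
+             * buffer, overlay the bytes received so far on top, and compare,
+             * so that a prefix shorter than 4 bytes can still be screened.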
+ **/ + size_t const toCopy = MIN(4, srcSize); + unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); + assert(src != NULL); + ZSTD_memcpy(hbuf, src, toCopy); + if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { + /* not a zstd frame : let's check if it's a skippable frame */ + MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); + ZSTD_memcpy(hbuf, src, toCopy); + if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { + RETURN_ERROR(prefix_unknown, + "first bytes don't correspond to any supported magic number"); + } } } + return minInputSize; + } + + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame */ + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; + return 0; + } + RETURN_ERROR(prefix_unknown, ""); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format); + if (srcSize < fhsize) return fhsize; + zfhPtr->headerSize = (U32)fhsize; + } + + { BYTE const fhdByte = ip[minInputSize-1]; + size_t pos = minInputSize; + U32 const dictIDSizeCode = fhdByte&3; + U32 const checksumFlag = (fhdByte>>2)&1; + U32 const singleSegment = (fhdByte>>5)&1; + U32 const fcsID = fhdByte>>6; + U64 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN; + RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported, + "reserved bits, must be zero"); + + if (!singleSegment) { + BYTE const wlByte = ip[pos++]; + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; + RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, ""); + windowSize = (1ULL << windowLog); + windowSize += (windowSize >> 3) * (wlByte&7); + } + switch(dictIDSizeCode) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : break; + case 1 : dictID = ip[pos]; pos++; break; + case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; + case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break; + } + switch(fcsID) + { + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; + case 0 : if (singleSegment) frameContentSize = ip[pos]; break; + case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; + case 2 : frameContentSize = MEM_readLE32(ip+pos); break; + case 3 : frameContentSize = MEM_readLE64(ip+pos); break; + } + if (singleSegment) windowSize = frameContentSize; + + zfhPtr->frameType = ZSTD_frame; + zfhPtr->frameContentSize = frameContentSize; + zfhPtr->windowSize = windowSize; + zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + zfhPtr->dictID = dictID; + zfhPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameHeader() : + * decode Frame Header, or require larger `srcSize`. + * note : this function does not consume input, it only reads it. 
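+ * Worked example of the window-size decoding above (assumed byte value):
+ * a window descriptor byte of 0x09 gives windowLog = (0x09 >> 3) + 10 = 11
+ * and mantissa 1, hence windowSize = 2048 + (2048 >> 3) * 1 = 2304 bytes.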
+ * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) +{ + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); +} + +/** ZSTD_getFrameContentSize() : + * compatible with legacy mode + * @return : decompressed size of the single frame pointed to be `src` if known, otherwise + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize); + return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret; + } +#endif + { ZSTD_frameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { + return 0; + } else { + return zfh.frameContentSize; + } } +} + +static size_t readSkippableFrameSize(void const* src, size_t srcSize) +{ + size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE; + U32 sizeU32; + + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); + { + size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } +} + +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
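+ * Minimal calling sketch (buffer names are placeholders, not part of the
+ * API):
+ * unsigned variant;
+ * size_t const n = ZSTD_readSkippableFrame(out, outCap, &variant, in, inSize);
+ * if (!ZSTD_isError(n)) { n payload bytes are in out[]. }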
+ */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize) +{ + U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + if (ZSTD_isError(skippableSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + if (srcSize) return ZSTD_CONTENTSIZE_ERROR; + + return totalDstSize; +} + +/** ZSTD_getDecompressedSize() : + * compatible with legacy mode + * @return : decompressed size if known, 0 otherwise + note : 0 can mean any of the following : + - frame content is empty + - decompressed size field is not present in frame header + - frame header unknown / not supported + - frame header not complete (`srcSize` too small) */ +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN); + return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret; +} + + +/** ZSTD_decodeFrameHeader() : + * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * If multiple DDict references are enabled, also will choose the correct DDict to use. 
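+ * Also verifies that any dictID announced by the frame matches the loaded
+ * dictionary (outside fuzzing builds) and resets the xxhash state when the
+ * frame carries a checksum that will be validated.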
+ * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); + if (ZSTD_isError(result)) return result; /* invalid header */ + RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); + + /* Reference DDict requested by frame if dctx references multiple ddicts */ + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) { + ZSTD_DCtx_selectFrameDDict(dctx); + } + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* Skip the dictID check in fuzzing mode, because it makes the search + * harder. + */ + RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID), + dictionary_wrong, ""); +#endif + dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0; + if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0); + dctx->processedCSize += headerSize; + return 0; +} + +static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) +{ + ZSTD_frameSizeInfo frameSizeInfo; + frameSizeInfo.compressedSize = ret; + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + return frameSizeInfo; +} + +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) + return ZSTD_findFrameSizeInfoLegacy(src, srcSize); +#endif + + if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || + frameSizeInfo.compressedSize <= srcSize); + return frameSizeInfo; + } else { + const BYTE* ip = (const BYTE*)src; + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ + { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + } + + ip += zfh.headerSize; + remainingSize -= zfh.headerSize; + + /* Iterate over each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return ZSTD_errorFrameSizeInfo(cBlockSize); + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + nbBlocks++; + + if (blockProperties.lastBlock) break; + } + + /* Final frame content checksum */ + if (zfh.checksumFlag) { + if (remainingSize < 4) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + ip += 4; + } + + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
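+                                          /* exact size when recorded in the
+                                           * header, else a worst-case bound of
+                                           * one full block per block header */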
zfh.frameContentSize
+ : nbBlocks * zfh.blockSizeMax;
+ return frameSizeInfo;
+ }
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ * compatible with legacy mode
+ * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ return frameSizeInfo.compressedSize;
+}
+
+/** ZSTD_decompressBound() :
+ * compatible with legacy mode
+ * `src` must point to the start of a ZSTD frame or a skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the maximum decompressed size of the compressed source
+ */
+unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+{
+ unsigned long long bound = 0;
+ /* Iterate over each frame */
+ while (srcSize > 0) {
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ size_t const compressedSize = frameSizeInfo.compressedSize;
+ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+ return ZSTD_CONTENTSIZE_ERROR;
+ assert(srcSize >= compressedSize);
+ src = (const BYTE*)src + compressedSize;
+ srcSize -= compressedSize;
+ bound += decompressedBound;
+ }
+ return bound;
+}
+
+
+/*-*************************************************************
+ * Frame decoding
+ ***************************************************************/
+
+/** ZSTD_insertBlock() :
+ * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+ DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize);
+ ZSTD_checkContinuity(dctx, blockStart, blockSize);
+ dctx->previousDstEnd = (const char*)blockStart + blockSize;
+ return blockSize;
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_copyRawBlock");
+ RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, "");
+ if (dst == NULL) {
+ if (srcSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ ZSTD_memcpy(dst, src, srcSize);
+ return srcSize;
+}
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+ BYTE b,
+ size_t regenSize)
+{
+ RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, "");
+ if (dst == NULL) {
+ if (regenSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ ZSTD_memset(dst, b, regenSize);
+ return regenSize;
+}
+
+static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming)
+{
+#if ZSTD_TRACE
+ if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) {
+ ZSTD_Trace trace;
+ ZSTD_memset(&trace, 0, sizeof(trace));
+ trace.version = ZSTD_VERSION_NUMBER;
+ trace.streaming = streaming;
+ if (dctx->ddict) {
+ trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict);
+ trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict);
+ trace.dictionaryIsCold = dctx->ddictIsCold;
+ }
+ trace.uncompressedSize = (size_t)uncompressedSize;
+ trace.compressedSize = (size_t)compressedSize;
+ trace.dctx = dctx;
+ ZSTD_trace_decompress_end(dctx->traceCtx, &trace);
+ }
+#else
+ (void)dctx;
+ (void)uncompressedSize;
+ (void)compressedSize;
+ (void)streaming;
+#endif
+}
+
+
+/*! 
ZSTD_decompressFrame() : + * @dctx must be properly initialized + * will update *srcPtr and *srcSizePtr, + * to make *srcPtr progress by one frame. */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void** srcPtr, size_t *srcSizePtr) +{ + const BYTE* const istart = (const BYTE*)(*srcPtr); + const BYTE* ip = istart; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart; + BYTE* op = ostart; + size_t remainingSrcSize = *srcSizePtr; + + DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr); + + /* check */ + RETURN_ERROR_IF( + remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + + /* Frame Header */ + { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal( + ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format); + if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; + RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , ""); + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + + /* Loop on each block */ + while (1) { + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + + switch(blockProperties.blockType) + { + case bt_compressed: + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); + break; + case bt_raw : + decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); + break; + case bt_rle : + decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); + break; + case bt_reserved : + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + + if (ZSTD_isError(decodedSize)) return decodedSize; + if (dctx->validateChecksum) + XXH64_update(&dctx->xxhState, op, decodedSize); + if (decodedSize != 0) + op += decodedSize; + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; + if (blockProperties.lastBlock) break; + } + + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize, + corruption_detected, ""); + } + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, ""); + if (!dctx->forceIgnoreChecksum) { + U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); + U32 checkRead; + checkRead = MEM_readLE32(ip); + RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, ""); + } + ip += 4; + remainingSrcSize -= 4; + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +} + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + const ZSTD_DDict* ddict) +{ + void* const dststart = dst; + int moreThan1Frame = 0; + + DEBUGLOG(5, "ZSTD_decompressMultiFrame"); + assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ + + if (ddict) { + dict = 
ZSTD_DDict_dictContent(ddict); + dictSize = ZSTD_DDict_dictSize(ddict); + } + + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + size_t decodedSize; + size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); + if (ZSTD_isError(frameSize)) return frameSize; + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, + "legacy support is not compatible with static dctx"); + + decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); + if (ZSTD_isError(decodedSize)) return decodedSize; + + assert(decodedSize <= dstCapacity); + dst = (BYTE*)dst + decodedSize; + dstCapacity -= decodedSize; + + src = (const BYTE*)src + frameSize; + srcSize -= frameSize; + + continue; + } +#endif + + { U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(4, "reading magic number %08X (expecting %08X)", + (unsigned)magicNumber, ZSTD_MAGICNUMBER); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), ""); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); + } + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, + &src, &srcSize); + RETURN_ERROR_IF( + (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) + && (moreThan1Frame==1), + srcSize_wrong, + "At least one frame successfully completed, " + "but following bytes are garbage: " + "it's more likely to be a srcSize error, " + "specifying more input bytes than size of frame(s). " + "Note: one could be unlucky, it might be a corruption error instead, " + "happening right at the place where we expect zstd magic bytes. 
" + "But this is _much_ less likely than a srcSize field error."); + if (ZSTD_isError(res)) return res; + assert(res <= dstCapacity); + if (res != 0) + dst = (BYTE*)dst + res; + dstCapacity -= res; + } + moreThan1Frame = 1; + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed"); + + return (size_t)((BYTE*)dst - (BYTE*)dststart); +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + + +static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) +{ + switch (dctx->dictUses) { + default: + assert(0 /* Impossible */); + ZSTD_FALLTHROUGH; + case ZSTD_dont_use: + ZSTD_clearDict(dctx); + return NULL; + case ZSTD_use_indefinitely: + return dctx->ddict; + case ZSTD_use_once: + dctx->dictUses = ZSTD_dont_use; + return dctx->ddict; + } +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx)); +} + + +size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ +#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) + size_t regenSize; + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); + RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); + regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); + ZSTD_freeDCtx(dctx); + return regenSize; +#else /* stack mode */ + ZSTD_DCtx dctx; + ZSTD_initDCtx_internal(&dctx); + return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize); +#endif +} + + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + +/** + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we + * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce + * output, and avoid copying the input. + * + * @param inputSize - The total amount of input that the caller currently has. 
+ */
+static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) {
+ if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock))
+ return dctx->expected;
+ if (dctx->bType != bt_raw)
+ return dctx->expected;
+ return BOUNDED(1, inputSize, dctx->expected);
+}
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+ switch(dctx->stage)
+ {
+ default: /* should not happen */
+ assert(0);
+ ZSTD_FALLTHROUGH;
+ case ZSTDds_getFrameHeaderSize:
+ ZSTD_FALLTHROUGH;
+ case ZSTDds_decodeFrameHeader:
+ return ZSTDnit_frameHeader;
+ case ZSTDds_decodeBlockHeader:
+ return ZSTDnit_blockHeader;
+ case ZSTDds_decompressBlock:
+ return ZSTDnit_block;
+ case ZSTDds_decompressLastBlock:
+ return ZSTDnit_lastBlock;
+ case ZSTDds_checkChecksum:
+ return ZSTDnit_checksum;
+ case ZSTDds_decodeSkippableHeader:
+ ZSTD_FALLTHROUGH;
+ case ZSTDds_skipFrame:
+ return ZSTDnit_skippableFrame;
+ }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }
+
+/** ZSTD_decompressContinue() :
+ * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize);
+ /* Sanity check */
+ RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed");
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
+
+ dctx->processedCSize += srcSize;
+
+ switch (dctx->stage)
+ {
+ case ZSTDds_getFrameHeaderSize :
+ assert(src != NULL);
+ if (dctx->format == ZSTD_f_zstd1) { /* allows header */
+ assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */
+ dctx->stage = ZSTDds_decodeSkippableHeader;
+ return 0;
+ } }
+ dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+ if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+ ZSTD_memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = dctx->headerSize - srcSize;
+ dctx->stage = ZSTDds_decodeFrameHeader;
+ return 0;
+
+ case ZSTDds_decodeFrameHeader:
+ assert(src != NULL);
+ ZSTD_memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
+ FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), "");
+ dctx->expected = ZSTD_blockHeaderSize;
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ return 0;
+
+ case ZSTDds_decodeBlockHeader:
+ { blockProperties_t bp;
+ size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+ if (ZSTD_isError(cBlockSize)) return cBlockSize;
+ RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum");
+ dctx->expected = cBlockSize;
+ dctx->bType = bp.blockType;
+ dctx->rleSize = bp.origSize;
+ if (cBlockSize) {
+ dctx->stage = bp.lastBlock ? 
ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: + DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock"); + { size_t rSize; + switch(dctx->bType) + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : + assert(srcSize <= dctx->expected); + rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); + FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed"); + assert(rSize == srcSize); + dctx->expected -= rSize; + break; + case bt_rle : + rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_reserved : /* should never happen */ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + FORWARD_IF_ERROR(rSize, ""); + RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum"); + DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize); + dctx->decodedSize += rSize; + if (dctx->validateChecksum) XXH64_update(&dctx->xxhState, dst, rSize); + dctx->previousDstEnd = (char*)dst + rSize; + + /* Stay on the same stage until we are finished streaming the block. 
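+ * Only bt_raw blocks can be fed in partial pieces: the bt_compressed and
+ * bt_rle cases above reset dctx->expected to 0, so a positive remainder
+ * here means more raw bytes of the same block are still expected.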
*/ + if (dctx->expected > 0) { + return rSize; + } + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize); + RETURN_ERROR_IF( + dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && dctx->decodedSize != dctx->fParams.frameContentSize, + corruption_detected, ""); + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + } + return rSize; + } + + case ZSTDds_checkChecksum: + assert(srcSize == 4); /* guaranteed by dctx->expected */ + { + if (dctx->validateChecksum) { + U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); + RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); + } + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; + return 0; + + case ZSTDds_skipFrame: + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } +} + + +static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dict; + dctx->previousDstEnd = (const char*)dict + dictSize; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + return 0; +} + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. 
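+ * Editor's note - the layout consumed below, summarized from the code:
+ *     4 bytes   ZSTD_MAGIC_DICTIONARY
+ *     4 bytes   dictID
+ *     Huffman table for literals
+ *     FSE tables for offsets, match lengths, then literal lengths
+ *     3 x 4 bytes  initial repcodes
+ *     remaining bytes : raw dictionary content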
+ * @return : size of entropy tables read */ +size_t +ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + + RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small"); + assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ + dictPtr += 8; /* skip header = magic + dictID */ + + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable)); + ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE); + { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */ + size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable); +#ifdef HUF_FORCE_DECOMPRESS_X1 + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); +#else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), + workspace, workspaceSize); +#endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; + } + + { short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->OFTable, + offcodeNCount, offcodeMaxValue, + OF_base, OF_bits, + offcodeLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */0); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->MLTable, + matchlengthNCount, matchlengthMaxValue, + ML_base, ML_bits, + matchlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->LLTable, + litlengthNCount, litlengthMaxValue, + LL_base, LL_bits, + litlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); + dictPtr += 
litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + { int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); + for (i=0; i<3; i++) { + U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4; + RETURN_ERROR_IF(rep==0 || rep > dictContentSize, + dictionary_corrupted, ""); + entropy->rep[i] = rep; + } } + + return (size_t)(dictPtr - (const BYTE*)dict); +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize); + { U32 const magic = MEM_readLE32(dict); + if (magic != ZSTD_MAGIC_DICTIONARY) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } } + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize); + RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, ""); + dict = (const char*)dict + eSize; + dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +{ + assert(dctx != NULL); +#if ZSTD_TRACE + dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0; +#endif + dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ + dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->processedCSize = 0; + dctx->decodedSize = 0; + dctx->previousDstEnd = NULL; + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (dict && dictSize) + RETURN_ERROR_IF( + ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)), + dictionary_corrupted, ""); + return 0; +} + + +/* ====== ZSTD_DDict ====== */ + +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict"); + assert(dctx != NULL); + if (ddict) { + const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict); + size_t const dictSize = ZSTD_DDict_dictSize(ddict); + const void* const dictEnd = dictStart + dictSize; + dctx->ddictIsCold = (dctx->dictEnd != dictEnd); + DEBUGLOG(4, "DDict is %s", + dctx->ddictIsCold ? "~cold~" : "hot!"); + } + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (ddict) { /* NULL ddict is equivalent to no dictionary */ + ZSTD_copyDDictParameters(dctx, ddict); + } + return 0; +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. 
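+ * Editor's note (illustrative sketch, not upstream documentation): paired
+ * with ZSTD_getDictID_fromFrame() below, this allows checking that a frame
+ * and a candidate dictionary agree before decompressing; `fail` is a
+ * hypothetical caller-side handler:
+ *
+ *     unsigned const wanted = ZSTD_getDictID_fromFrame(src, srcSize);
+ *     unsigned const have = ZSTD_getDictID_fromDict(dictBuf, dictBufSize);
+ *     if (wanted != 0 && wanted != have) return fail();  // wrong dictionary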
*/
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+ if (dictSize < 8) return 0;
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
+ return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary (most common case).
+ * - The frame was built with dictID intentionally removed.
+ * The needed dictionary is then a hidden piece of information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, frame header could not be decoded.
+ * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use
+ * ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+ size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
+ if (ZSTD_isError(hError)) return 0;
+ return zfp.dictID;
+}
+
+
+/*! ZSTD_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict)
+{
+ /* pass content and size in case legacy frames are encountered */
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+ NULL, 0,
+ ddict);
+}
+
+
+/*=====================================
+* Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+ DEBUGLOG(3, "ZSTD_createDStream");
+ return ZSTD_createDCtx_internal(ZSTD_defaultCMem);
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+ return ZSTD_initStaticDCtx(workspace, workspaceSize);
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+ return ZSTD_createDCtx_internal(customMem);
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+ return ZSTD_freeDCtx(zds);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ if (dict && dictSize != 0) {
+ dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+ RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!");
+ dctx->ddict = dctx->ddictLocal;
+ dctx->dictUses = ZSTD_use_indefinitely;
+ }
+ return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+size_t 
ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), ""); + dctx->dictUses = ZSTD_use_once; + return 0; +} + +size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + + +/* ZSTD_initDStream_usingDict() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) +{ + DEBUGLOG(4, "ZSTD_initDStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , ""); + return ZSTD_startingInputLength(zds->format); +} + +/* note : this variant can't fail */ +size_t ZSTD_initDStream(ZSTD_DStream* zds) +{ + DEBUGLOG(4, "ZSTD_initDStream"); + return ZSTD_initDStream_usingDDict(zds, NULL); +} + +/* ZSTD_initDStream_usingDDict() : + * ddict will just be referenced, and must outlive decompression session + * this function cannot fail */ +size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) +{ + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +} + +/* ZSTD_resetDStream() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); +} + + +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (ddict) { + dctx->ddict = ddict; + dctx->dictUses = ZSTD_use_indefinitely; + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) { + if (dctx->ddictSet == NULL) { + dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem); + if (!dctx->ddictSet) { + RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!"); + } + } + assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */ + FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), ""); + } + } + return 0; +} + +/* ZSTD_DCtx_setMaxWindowSize() : + * note : no direct equivalence in ZSTD_DCtx_setParameter, + * since this version sets windowSize, and the other sets windowLog */ +size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax); + size_t const min = (size_t)1 << bounds.lowerBound; + size_t const max = (size_t)1 << bounds.upperBound; + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, ""); + RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, ""); + dctx->maxWindowSize = maxWindowSize; + return 0; +} + +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) +{ + return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, (int)format); +} + +ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + switch(dParam) { + case ZSTD_d_windowLogMax: + bounds.lowerBound = 
ZSTD_WINDOWLOG_ABSOLUTEMIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + case ZSTD_d_format: + bounds.lowerBound = (int)ZSTD_f_zstd1; + bounds.upperBound = (int)ZSTD_f_zstd1_magicless; + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + return bounds; + case ZSTD_d_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + case ZSTD_d_forceIgnoreChecksum: + bounds.lowerBound = (int)ZSTD_d_validateChecksum; + bounds.upperBound = (int)ZSTD_d_ignoreChecksum; + return bounds; + case ZSTD_d_refMultipleDDicts: + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; + default:; + } + bounds.error = ERROR(parameter_unsupported); + return bounds; +} + +/* ZSTD_dParam_withinBounds: + * @return 1 if value is within dParam bounds, + * 0 otherwise */ +static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +#define CHECK_DBOUNDS(p,v) { \ + RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \ +} + +size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value) +{ + switch (param) { + case ZSTD_d_windowLogMax: + *value = (int)ZSTD_highbit32((U32)dctx->maxWindowSize); + return 0; + case ZSTD_d_format: + *value = (int)dctx->format; + return 0; + case ZSTD_d_stableOutBuffer: + *value = (int)dctx->outBufferMode; + return 0; + case ZSTD_d_forceIgnoreChecksum: + *value = (int)dctx->forceIgnoreChecksum; + return 0; + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + switch(dParam) { + case ZSTD_d_windowLogMax: + if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT; + CHECK_DBOUNDS(ZSTD_d_windowLogMax, value); + dctx->maxWindowSize = ((size_t)1) << value; + return 0; + case ZSTD_d_format: + CHECK_DBOUNDS(ZSTD_d_format, value); + dctx->format = (ZSTD_format_e)value; + return 0; + case ZSTD_d_stableOutBuffer: + CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value); + dctx->outBufferMode = (ZSTD_bufferMode_e)value; + return 0; + case ZSTD_d_forceIgnoreChecksum: + CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value); + dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value; + return 0; + case ZSTD_d_refMultipleDDicts: + CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value); + if (dctx->staticSize != 0) { + RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!"); + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + ZSTD_DCtx_resetParameters(dctx); + } + return 0; +} + + +size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) +{ + 
return ZSTD_sizeof_DCtx(dctx);
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+ size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy */
+ unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
+ unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
+ size_t const minRBSize = (size_t) neededSize;
+ RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
+ frameParameter_windowTooLarge, "");
+ return minRBSize;
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{
+ size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ size_t const inBuffSize = blockSize; /* no block can be larger */
+ size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+ return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{
+ U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */
+ ZSTD_frameHeader zfh;
+ size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ if (ZSTD_isError(err)) return err;
+ RETURN_ERROR_IF(err>0, srcSize_wrong, "");
+ RETURN_ERROR_IF(zfh.windowSize > windowSizeMax,
+ frameParameter_windowTooLarge, "");
+ return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
+}
+
+
+/* ***** Decompression ***** */
+
+static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR;
+}
+
+static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize))
+ zds->oversizedDuration++;
+ else
+ zds->oversizedDuration = 0;
+}
+
+static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds)
+{
+ return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+/* Checks that the output buffer hasn't changed if ZSTD_bm_stable is used. */
+static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output)
+{
+ ZSTD_outBuffer const expect = zds->expectedOutBuffer;
+ /* No requirement when ZSTD_bm_stable is not enabled. */
+ if (zds->outBufferMode != ZSTD_bm_stable)
+ return 0;
+ /* Any buffer is allowed in zdss_init, this must be the same for every other call until
+ * the context is reset.
+ */
+ if (zds->streamStage == zdss_init)
+ return 0;
+ /* The buffer must match our expectation exactly. */
+ if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size)
+ return 0;
+ RETURN_ERROR(dstBuffer_wrong, "ZSTD_d_stableOutBuffer enabled but output differs!");
+}
+
+/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream()
+ * and updates the stage and the output buffer state. This call is extracted so it can be
+ * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode.
+ * NOTE: You must break after calling this function since the streamStage is modified. 
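+ *
+ * Editor's note (sketch only, error handling abbreviated): the public
+ * caller-side pattern that ultimately reaches this helper:
+ *
+ *     ZSTD_inBuffer in = { src, srcSize, 0 };
+ *     ZSTD_outBuffer out = { dst, dstCapacity, 0 };
+ *     while (in.pos < in.size) {
+ *         size_t const hint = ZSTD_decompressStream(dctx, &out, &in);
+ *         if (ZSTD_isError(hint)) return hint;
+ *         if (hint == 0) break;                    // frame decoded and flushed
+ *     }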
+ */ +static size_t ZSTD_decompressContinueStream( + ZSTD_DStream* zds, char** op, char* oend, + void const* src, size_t srcSize) { + int const isSkipFrame = ZSTD_isSkipFrame(zds); + if (zds->outBufferMode == ZSTD_bm_buffered) { + size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart; + size_t const decodedSize = ZSTD_decompressContinue(zds, + zds->outBuff + zds->outStart, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + if (!decodedSize && !isSkipFrame) { + zds->streamStage = zdss_read; + } else { + zds->outEnd = zds->outStart + decodedSize; + zds->streamStage = zdss_flush; + } + } else { + /* Write directly into the output buffer */ + size_t const dstSize = isSkipFrame ? 0 : (size_t)(oend - *op); + size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + *op += decodedSize; + /* Flushing is not needed. */ + zds->streamStage = zdss_read; + assert(*op <= oend); + assert(zds->outBufferMode == ZSTD_bm_stable); + } + return 0; +} + +size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + const char* const src = (const char*)input->src; + const char* const istart = input->pos != 0 ? src + input->pos : src; + const char* const iend = input->size != 0 ? src + input->size : src; + const char* ip = istart; + char* const dst = (char*)output->dst; + char* const ostart = output->pos != 0 ? dst + output->pos : dst; + char* const oend = output->size != 0 ? dst + output->size : dst; + char* op = ostart; + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, + "forbidden. in: pos: %u vs size: %u", + (U32)input->pos, (U32)input->size); + RETURN_ERROR_IF( + output->pos > output->size, + dstSize_tooSmall, + "forbidden. out: pos: %u vs size: %u", + (U32)output->pos, (U32)output->size); + DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); + FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), ""); + + while (someMoreWork) { + switch(zds->streamStage) + { + case zdss_init : + DEBUGLOG(5, "stage zdss_init => transparent reset "); + zds->streamStage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + zds->legacyVersion = 0; +#endif + zds->hostageByte = 0; + zds->expectedOutBuffer = *output; + ZSTD_FALLTHROUGH; + + case zdss_loadHeader : + DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + if (zds->legacyVersion) { + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; + return hint; + } } +#endif + { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } + if (ZSTD_isError(hSize)) { +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); + if (legacyVersion) { + ZSTD_DDict const* const ddict = ZSTD_getDDict(zds); + const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL; + size_t const dictSize = ddict ? 
ZSTD_DDict_dictSize(ddict) : 0; + DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion); + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext, + zds->previousLegacyVersion, legacyVersion, + dict, dictSize), ""); + zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ + return hint; + } } +#endif + return hSize; /* error */ + } + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + size_t const remainingInput = (size_t)(iend-ip); + assert(iend >= ip); + if (toLoad > remainingInput) { /* not enough input to load full header */ + if (remainingInput > 0) { + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + zds->lhSize += remainingInput; + } + input->pos = input->size; + /* check first few bytes */ + FORWARD_IF_ERROR( + ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), + "First few bytes detected incorrect" ); + /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; + break; + } } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + ip = istart + cSize; + op += decompressedSize; + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } } + + /* Check output buffer is large enough for ZSTD_odm_stable. 
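+ * (that is, ZSTD_bm_stable output mode: the frame is written straight into
+ * the caller's buffer with no internal staging, so a known frameContentSize
+ * must fit in the remaining output space up front)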
*/ + if (zds->outBufferMode == ZSTD_bm_stable + && zds->fParams.frameType != ZSTD_skippableFrame + && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) { + RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small"); + } + + /* Consume header (see ZSTDds_decodeFrameHeader) */ + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + + if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), ""); + zds->expected = ZSTD_blockHeaderSize; + zds->stage = ZSTDds_decodeBlockHeader; + } + + /* control buffer memory usage */ + DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)", + (U32)(zds->fParams.windowSize >>10), + (U32)(zds->maxWindowSize >> 10) ); + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered + ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); + + { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize); + int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds); + + if (tooSmall || tooLarge) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + RETURN_ERROR_IF( + bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), + memory_allocation, ""); + } else { + ZSTD_customFree(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_customMalloc(bufferSize, zds->customMem); + RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, ""); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; + } } } + zds->streamStage = zdss_read; + ZSTD_FALLTHROUGH; + + case zdss_read: + DEBUGLOG(5, "stage zdss_read"); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)); + DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); + if (neededInSize==0) { /* end of frame */ + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; + } } + if (ip==iend) { someMoreWork = 0; break; } /* no more input */ + zds->streamStage = zdss_load; + ZSTD_FALLTHROUGH; + + case zdss_load: + { size_t const neededInSize = 
ZSTD_nextSrcSizeToDecompress(zds); + size_t const toLoad = neededInSize - zds->inPos; + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { + RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos, + corruption_detected, + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } + ip += loadedSize; + zds->inPos += loadedSize; + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ + zds->inPos = 0; /* input is consumed */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), ""); + /* Function modifies the stage so we must break */ + break; + } + case zdss_flush: + { size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); + op += flushedSize; + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); + zds->outStart = zds->outEnd = 0; + } + break; + } } + /* cannot complete flush */ + someMoreWork = 0; + break; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ + input->pos = (size_t)(ip - (const char*)(input->src)); + output->pos = (size_t)(op - (char*)(output->dst)); + + /* Update the expected output buffer for ZSTD_obm_stable. 
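+ * (ZSTD_bm_stable mode: ZSTD_checkOutBuffer() verifies that the caller
+ * passes back this same buffer on the next call)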
*/
+ zds->expectedOutBuffer = *output;
+
+ if ((ip==istart) && (op==ostart)) { /* no forward progress */
+ zds->noForwardProgress ++;
+ if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+ RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
+ RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+ assert(0);
+ }
+ } else {
+ zds->noForwardProgress = 0;
+ }
+ { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+ if (!nextSrcSizeHint) { /* frame fully decoded */
+ if (zds->outEnd == zds->outStart) { /* output fully flushed */
+ if (zds->hostageByte) {
+ if (input->pos >= input->size) {
+ /* can't release hostage (not present) */
+ zds->streamStage = zdss_read;
+ return 1;
+ }
+ input->pos++; /* release hostage */
+ } /* zds->hostageByte */
+ return 0;
+ } /* zds->outEnd == zds->outStart */
+ if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+ input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */
+ zds->hostageByte=1;
+ }
+ return 1;
+ } /* nextSrcSizeHint==0 */
+ nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */
+ assert(zds->inPos <= nextSrcSizeHint);
+ nextSrcSizeHint -= zds->inPos; /* part already loaded */
+ return nextSrcSizeHint;
+ }
+}
+
+size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_decompressStream() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.c b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.c
new file mode 100644
index 000000000..e1ff21582
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.c
@@ -0,0 +1,2098 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses. 
+ */
+
+/* zstd_decompress_block :
+ * this module takes care of decompressing _compressed_ blocks */
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/compiler.h" /* prefetch */
+#include "../common/cpu.h" /* bmi2 */
+#include "../common/mem.h" /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "../common/fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "../common/huf.h"
+#include "../common/zstd_internal.h"
+#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
+#include "zstd_ddict.h" /* ZSTD_DDictDictContent */
+#include "zstd_decompress_block.h"
+#include "../common/bits.h" /* ZSTD_highbit32 */
+
+/*_*******************************************************
+* Macros
+**********************************************************/
+
+/* These two optional macros force the use, one way or another, of the two
+ * ZSTD_decompressSequences implementations. You can't force in both directions
+ * at the same time.
+ */
+#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
+#endif
+
+
+/*_*******************************************************
+* Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+ * Block decoding
+ ***************************************************************/
+
+/*! ZSTD_getcBlockSize() :
+ * Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ blockProperties_t* bpPtr)
+{
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
+
+ { U32 const cBlockHeader = MEM_readLE24(src);
+ U32 const cSize = cBlockHeader >> 3;
+ bpPtr->lastBlock = cBlockHeader & 1;
+ bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+ bpPtr->origSize = cSize; /* only useful for RLE */
+ if (bpPtr->blockType == bt_rle) return 1;
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
+ return cSize;
+ }
+}
+
+/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+{
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+ {
+ /* room for litbuffer to fit without read faulting */
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_in_dst;
+ }
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+ {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ if (splitImmediately) {
+ /* split now: the last ZSTD_LITBUFFEREXTRASIZE bytes go to litExtraBuffer, the rest to the end of dst */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize - 
ZSTD_LITBUFFEREXTRASIZE; + } + else { + /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; + } + else + { + /* fits entirely within litExtraBuffer, so no split is necessary */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +} + +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity, const streaming_operation streaming); +/*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) +{ + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + + switch(litEncType) + { + case set_repeat: + DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); + RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); + ZSTD_FALLTHROUGH; + + case set_compressed: + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ + /* 2 - 2 - 10 - 10 */ + singleStream = !lhlCode; + lhSize = 3; + litSize = (lhc >> 4) & 0x3FF; + litCSize = (lhc >> 14) & 0x3FF; + break; + case 2: + /* 2 - 2 - 14 - 14 */ + lhSize = 4; + litSize = (lhc >> 4) & 0x3FFF; + litCSize = lhc >> 18; + break; + case 3: + /* 2 - 2 - 18 - 18 */ + lhSize = 5; + litSize = (lhc >> 4) & 0x3FFFF; + litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); + + /* prefetch huffman table if cold */ + if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { + PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); + } + + if (litEncType==set_repeat) { + if (singleStream) { + 
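+                    /* Note: per the zstd format, lhlCode 0 stores the literals as one
+                     * Huffman bitstream, while the other size formats use the 4-stream
+                     * layout (four independent bitstreams behind a small jump table)
+                     * so the streams can be decoded in parallel. */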
hufSuccess = HUF_decompress1X_usingDTable_bmi2( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); + } else { + hufSuccess = HUF_decompress4X_usingDTable_bmi2( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); + } + } else { + if (singleStream) { +#if defined(HUF_FORCE_DECOMPRESS_X2) + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace)); +#else + hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); +#endif + } else { + hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); + + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + dctx->litEntropy = 1; + if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; + return litCSize + lhSize; + } + + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ + RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); + } + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize+litSize; + } + /* direct reference into compressed stream */ + dctx->litPtr = istart+lhSize; + dctx->litSize = litSize; + dctx->litBufferEnd = dctx->litPtr + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + return lhSize+litSize; + } + + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since 
lhlCode into [0..3] */
+            lhSize = 1;
+            litSize = istart[0] >> 3;
+            break;
+        case 1:
+            lhSize = 2;
+            RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
+            litSize = MEM_readLE16(istart) >> 4;
+            break;
+        case 3:
+            lhSize = 3;
+            RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
+            litSize = MEM_readLE24(istart) >> 4;
+            break;
+        }
+        RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+        RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+        ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+        if (dctx->litBufferLocation == ZSTD_split)
+        {
+            ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+            ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+        }
+        else
+        {
+            ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+        }
+        dctx->litPtr = dctx->litBuffer;
+        dctx->litSize = litSize;
+        return lhSize+1;
+    }
+    default:
+        RETURN_ERROR(corruption_detected, "impossible");
+    }
+    }
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with following method :
+ * - start from default distributions, present in /lib/common/zstd_internal.h
+ * - generate tables normally, using ZSTD_buildFSETable()
+ * - printout the content of tables
+ * - prettify output, report below, test with fuzzer to ensure it's correct */
+
+/* Default FSE distribution table for Literal Lengths */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* ... (pre-generated entries omitted) ... */
+};   /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* ... (pre-generated entries omitted) ... */
+};   /* OF_defaultDTable */
+
+/* Default FSE distribution table for Match Lengths */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* ... (pre-generated entries omitted) ... */
+};   /* ML_defaultDTable */
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
+{
+    void* ptr = dt;
+    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
+    ZSTD_seqSymbol* const cell = dt + 1;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->nbBits = 0;
+    cell->nextState = 0;
+    assert(nbAddBits < 255);
+    cell->nbAdditionalBits = nbAddBits;
+    cell->baseValue = baseValue;
+}
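+/* Example: for an RLE-coded sequence table the single cell written above is the
+ * whole table. An RLE descriptor byte of 5 for the offset codes yields
+ * baseValue = OF_base[5] and nbAdditionalBits = OF_bits[5], so every sequence
+ * in the block decodes the same offset code and pulls 5 extra offset bits from
+ * the bitstream. */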
+
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * cannot fail if input is valid =>
+ * all inputs are presumed validated at this stage */
+FORCE_INLINE_TEMPLATE
+void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_seqSymbol* const tableDecode = dt+1;
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+
+    U16* symbolNext = (U16*)wksp;
+    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+    U32 highThreshold = tableSize - 1;
+
+
+    /* Sanity Checks */
+    assert(maxSymbolValue <= MaxSeq);
+    assert(tableLog <= MaxFSELog);
+    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+    (void)wkspSize;
+    /* Init, lay down lowprob symbols */
+    { ZSTD_seqSymbol_header DTableH;
+      DTableH.tableLog = tableLog;
+      DTableH.fastMode = 1;
+      { S16 const largeLimit= (S16)(1 << (tableLog-1));
+        U32 s;
+        for (s=0; s<maxSV1; s++) {
+            if (normalizedCounter[s]==-1) {
+                tableDecode[highThreshold--].baseValue = s;
+                symbolNext[s] = 1;
+            } else {
+                if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                assert(normalizedCounter[s]>=0);
+                symbolNext[s] = (U16)normalizedCounter[s];
+        }   }   }
+      ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    assert(tableSize <= 512);
+    /* Specialized symbol spreading for the case when there are
+     * no low probability (-1 count) symbols. When compressing
+     * small blocks we avoid low probability symbols to hit this
+     * case, since header decoding speed matters more.
+     */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
+         */
+        {
+            U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                pos += n;
+            }
+        }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].baseValue = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            int const n = normalizedCounter[s];
+            for (i=0; i<n; i++) {
+                tableDecode[position].baseValue = s;
+                position = (position + step) & tableMask;
+                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        assert(position == 0);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {
+        U32 u;
+        for (u=0; u<tableSize; u++) {
+            U32 const symbol = tableDecode[u].baseValue;
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+            assert(nbAdditionalBits[symbol] < 255);
+            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
+            tableDecode[u].baseValue = baseValue[symbol];
+        }
+    }
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+#endif
+
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U8* nbAdditionalBits,
+            unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+        return;
+    }
+#endif
+    (void)bmi2;
+    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ * @return : nb bytes read from src,
+ *           or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const U32* baseValue, const U8* nbAdditionalBits,
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
+                                 int bmi2)
+{
+    switch(type)
+    {
+    case set_rle :
+        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
+        { U32 const symbol = *(const BYTE*)src;
+          U32 const baseline = baseValue[symbol];
+          U8 const nbBits = nbAdditionalBits[symbol];
+          ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
+        }
+        *DTablePtr = DTableSpace;
+        return 1;
+    case set_basic :
+        *DTablePtr = defaultTable;
+        return 0;
+    case set_repeat:
+        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
+        return 0;
+    case set_compressed :
+        { unsigned tableLog;
+          S16 norm[MaxSeq+1];
+          size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+          RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+          RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+          ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
+          *DTablePtr = DTableSpace;
+          return headerSize;
+        }
+    default :
+        assert(0);
+        RETURN_ERROR(GENERIC, "impossible");
+    }
+}
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+    int nbSeq;
+    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+
+    /* check */
+    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
+
+    /* SeqHead */
+    nbSeq = *ip++;
+    if (!nbSeq) {
+        *nbSeqPtr=0;
+        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
+        return 1;
+    }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
+            ip+=2;
+        } else {
+            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+        }
+    }
+    *nbSeqPtr = nbSeq;
+
+    /* FSE table descriptors */
+    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
+    { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+      symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+      symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+      ip++;
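+      /* Example: a descriptor byte of 0xB4 (binary 10 11 01 00) selects
+       * LLtype = set_compressed (2), OFtype = set_repeat (3) and
+       * MLtype = set_rle (1); the two low bits are reserved. */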
+      /* Build DTables */
+      { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+                                                  LLtype, MaxLL, LLFSELog,
+                                                  ip, iend-ip,
+                                                  LL_base, LL_bits,
+                                                  LL_defaultDTable, dctx->fseEntropy,
+                                                  dctx->ddictIsCold, nbSeq,
+                                                  dctx->workspace, sizeof(dctx->workspace),
+                                                  ZSTD_DCtx_get_bmi2(dctx));
+        RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+        ip += llhSize;
+      }
+
+      { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+                                                  OFtype, MaxOff, OffFSELog,
+                                                  ip, iend-ip,
+                                                  OF_base, OF_bits,
+                                                  OF_defaultDTable, dctx->fseEntropy,
+                                                  dctx->ddictIsCold, nbSeq,
+                                                  dctx->workspace, sizeof(dctx->workspace),
+                                                  ZSTD_DCtx_get_bmi2(dctx));
+        RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+        ip += ofhSize;
+      }
+
+      { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+                                                  MLtype, MaxML, MLFSELog,
+                                                  ip, iend-ip,
+                                                  ML_base, ML_bits,
+                                                  ML_defaultDTable, dctx->fseEntropy,
+                                                  dctx->ddictIsCold, nbSeq,
+                                                  dctx->workspace, sizeof(dctx->workspace),
+                                                  ZSTD_DCtx_get_bmi2(dctx));
+        RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+        ip += mlhSize;
+      }
+    }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+} seq_t;
+
+typedef struct {
+    size_t state;
+    const ZSTD_seqSymbol* table;
+} ZSTD_fseState;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    ZSTD_fseState stateLL;
+    ZSTD_fseState stateOffb;
+    ZSTD_fseState stateML;
+    size_t prevOffset[ZSTD_REP_NUM];
+} seqState_t;
+
+/*! ZSTD_overlapCopy8() :
+ *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
+ *  If the offset is < 8 then the offset is spread to at least 8 bytes.
+ *
+ *  Precondition: *ip <= *op
+ *  Postcondition: *op - *ip >= 8
+ */
+HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+    assert(*ip <= *op);
+    if (offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[offset];
+        (*op)[0] = (*ip)[0];
+        (*op)[1] = (*ip)[1];
+        (*op)[2] = (*ip)[2];
+        (*op)[3] = (*ip)[3];
+        *ip += dec32table[offset];
+        ZSTD_copy4(*op+4, *ip);
+        *ip -= sub2;
+    } else {
+        ZSTD_copy8(*op, *ip);
+    }
+    *ip += 8;
+    *op += 8;
+    assert(*op - *ip >= 8);
+}
+
+/*! ZSTD_safecopy() :
+ *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
+ *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
+ *  This function is only called in the uncommon case where the sequence is near the end of the block. It
+ *  should be fast for a single long sequence, but can be slow for several short sequences.
+ *
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
+ *           The src buffer must be before the dst buffer.
+ */
+static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+    ptrdiff_t const diff = op - ip;
+    BYTE* const oend = op + length;
+
+    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
+           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
+
+    if (length < 8) {
+        /* Handle short lengths. */
+        while (op < oend) *op++ = *ip++;
+        return;
+    }
+    if (ovtype == ZSTD_overlap_src_before_dst) {
+        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap.
*/ + assert(length >= 8); + ZSTD_overlapCopy8(&op, &ip, diff); + length -= 8; + assert(op - ip >= 8); + assert(op <= oend); + } + + if (oend <= oend_w) { + /* No risk of overwrite. */ + ZSTD_wildcopy(op, ip, length, ovtype); + return; + } + if (op <= oend_w) { + /* Wildcopy until we get close to the end. */ + assert(oend > oend_w); + ZSTD_wildcopy(op, ip, oend_w - op, ovtype); + ip += oend_w - op; + op += oend_w - op; + } + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ = *ip++; + return; + } + + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); + ip += oend - WILDCOPY_OVERLENGTH - op; + op += oend - WILDCOPY_OVERLENGTH - op; + } + + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_execSequenceEnd(): + * This version handles cases that are near the end of the output buffer. It requires + * more careful checks to make sure there is no overflow. By separating out these hard + * and unlikely cases, we can speed up the common cases. + * + * NOTE: This function needs to be fast for a single long sequence, but doesn't need + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). 
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + +#if defined(__aarch64__) + /* prefetch sequence starting from match that will be used for copy later */ + PREFETCH_L1(match); +#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy 
literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. 
*/ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. 
*/ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + + +static void +ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) +{ + const void* ptr = dt; + const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", + (U32)DStatePtr->state, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) +{ + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = nextState + lowBits; +} + +/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum + * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ + (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ + ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ + : 0) + +typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +{ + seq_t seq; + /* + * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be + * loaded in one operation and extracted its fields by simply shifting or + * bit-extracting on aarch64. + * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh + * operations that cause performance drop. This can be avoided by using this + * ZSTD_memcpy hack. 
+ */ +#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) + ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; + ZSTD_seqSymbol* const llDInfo = &llDInfoS; + ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; + ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; + ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); +#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; +#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } else { + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; + } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + } + + return seq; +} + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +{ + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ + if (dctx->dictContentEndForFuzzing == NULL) return 0; + /* Dictionary is our prefix. */ + if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; + /* Dictionary is not our ext-dict. */ + if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; + /* Dictionary is not within our window size. */ + if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; + /* Dictionary is active. */ + return 1; +} + +MEM_STATIC void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) +{ +#if DEBUGLEVEL >= 1 + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. 
*/
+        assert(seq.offset <= windowSize);
+    }
+#else
+    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
+#endif
+}
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+
+
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset,
+                         const int frame)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
+    (void)frame;
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+        assert(dst != NULL);
+
+        ZSTD_STATIC_ASSERT(
+                BIT_DStream_unfinished < BIT_DStream_completed &&
+                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+                BIT_DStream_completed < BIT_DStream_overflow);
+
+        /* decompress without overrunning litPtr begins */
+        {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            /* Align the decompression loop to 32 + 16 bytes.
+             *
+             * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+             * speed swings based on the alignment of the decompression loop. This
+             * performance swing is caused by parts of the decompression loop falling
+             * out of the DSB. The entire decompression loop should fit in the DSB,
+             * when it can't we get much worse performance. You can measure if you've
+             * hit the good case or the bad case with this perf command for some
+             * compressed file test.zst:
+             *
+             *     perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+             *               -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+             *
+             * If you see most cycles served out of the MITE you've hit the bad case.
+             * If you see most cycles served out of the DSB you've hit the good case.
+             * If it is pretty even then you may be in an okay case.
+             *
+             * This issue has been reproduced on the following CPUs:
+             * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+             *             Use Instruments->Counters to get DSB/MITE cycles.
+             *             I never got performance swings, but I was able to
+             *             go from the good case of mostly DSB to half of the
+             *             cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompression loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on bad cases and influence on performance + * from changes external to this code, rather than to overoptimize on the current commit. + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); +# if __GNUC__ >= 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ == 8 || __GNUC__ == 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ + for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { + size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + } + + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (--nbSeq) + BIT_reloadDStream(&(seqState.DStream)); + } + } + } + + if (nbSeq > 0) /* there is remaining lit from extra buffer */ + { + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if 
__GNUC__ != 7
+            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
+            __asm__(".p2align 4");
+            __asm__("nop");
+            __asm__(".p2align 3");
+# elif __GNUC__ >= 11
+            __asm__(".p2align 3");
+# else
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 3");
+# endif
+#endif
+
+            for (; ; ) {
+                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                    return oneSeqSize;
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                op += oneSeqSize;
+                if (UNLIKELY(!--nbSeq))
+                    break;
+                BIT_reloadDStream(&(seqState.DStream));
+            }
+        }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+    {
+        size_t const lastLLSize = litBufferEnd - litPtr;
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+        litPtr = dctx->litExtraBuffer;
+        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    }
+    { size_t const lastLLSize = litBufferEnd - litPtr;
+      RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+      if (op != NULL) {
+          ZSTD_memcpy(op, litPtr, lastLLSize);
+          op += lastLLSize;
+      }
+    }
+
+    return op-ostart;
+}
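+/* Note on the two variants: the "SplitLitBuffer" body above is used when the
+ * literals were placed inside dst (wholly, or split with litExtraBuffer), so
+ * the output write pointer can catch up to the literals and copies must take
+ * the overlap-safe paths. The plain body below assumes the literals sit
+ * outside the write region and can use the faster unchecked copies. */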
+
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+    void* dst, size_t maxDstSize,
+    const void* seqStart, size_t seqSize, int nbSeq,
+    const ZSTD_longOffset_e isLongOffset,
+    const int frame)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
+    (void)frame;
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+        assert(dst != NULL);
+
+        ZSTD_STATIC_ASSERT(
+                BIT_DStream_unfinished < BIT_DStream_completed &&
+                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+                BIT_DStream_completed < BIT_DStream_overflow);
+
+#if defined(__GNUC__) && defined(__x86_64__)
+        __asm__(".p2align 6");
+        __asm__("nop");
+# if __GNUC__ >= 7
+        __asm__(".p2align 5");
+        __asm__("nop");
+        __asm__(".p2align 3");
+# else
+        __asm__(".p2align 4");
+        __asm__("nop");
+        __asm__(".p2align 3");
+# endif
+#endif
+
+        for ( ; ; ) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+            assert(!ZSTD_isError(oneSeqSize));
+            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+            if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                return oneSeqSize;
+            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+            op += oneSeqSize;
+            if (UNLIKELY(!--nbSeq))
+                break;
+            BIT_reloadDStream(&(seqState.DStream));
+        }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    { size_t const lastLLSize = litEnd - litPtr;
+      RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+      if (op != NULL) {
+          ZSTD_memcpy(op, litPtr, lastLLSize);
+          op += lastLLSize;
+      }
+    }
+
+    return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                           const ZSTD_longOffset_e isLongOffset,
+                           const int frame)
+{
+    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+
+static size_t
+ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+                                               void* dst, size_t maxDstSize,
+                                         const void* seqStart, size_t seqSize, int nbSeq,
+                                         const ZSTD_longOffset_e isLongOffset,
+                                         const int frame)
+{
+    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+
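+/* Overview of the long-offsets path that follows: sequences are decoded
+ * ADVANCED_SEQS (= STORED_SEQS = 8) ahead of execution into a small ring
+ * buffer, and the match source of each freshly decoded sequence is prefetched
+ * immediately, so that by the time a sequence is executed its (possibly
+ * out-of-cache) match bytes are already on their way to L1. */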
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+                   const BYTE* const prefixStart, const BYTE* const dictEnd)
+{
+    prefetchPos += sequence.litLength;
+    { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+      const BYTE* const match = matchBase + prefetchPos - sequence.offset;   /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+                                                                              * No consequence though : memory address is only used for prefetching, not for dereferencing */
+      PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);   /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+    }
+    return prefetchPos + sequence.matchLength;
+}
+
+/* This decoding function employs prefetching
+ * to reduce latency impact of cache misses.
+ * It's generally employed when block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset,
+                         const int frame)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    (void)frame;
+
+    /* Regen sequences */
+    if (nbSeq) {
+#define STORED_SEQS 8
+#define STORED_SEQS_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS STORED_SEQS
+        seq_t sequences[STORED_SEQS];
+        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+        seqState_t seqState;
+        int seqNb;
+        size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
+        dctx->fseEntropy = 1;
+        { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        assert(dst != NULL);
+        assert(iend >= ip);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        /* prepare in advance */
+        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+            sequences[seqNb] = sequence;
+        }
+        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+
+        /* decompress without stomping litBuffer */
+        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            size_t oneSeqSize;
+
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+            {
+                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                sequences[seqNb & STORED_SEQS_MASK] = sequence;
+                op += oneSeqSize;
+            }
+            else
+            {
+                /* lit buffer is either wholly contained in first or second split, or not split at all*/
+                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                sequences[seqNb & STORED_SEQS_MASK] = sequence;
+                op += oneSeqSize;
+            }
+        }
+        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+
+        /* finish queue */
+        seqNb -= seqAdvance;
+        for ( ; seqNb<nbSeq ; seqNb++) {
+            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+            {
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequence->litLength -= leftoverLit;
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {
+                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                    op += oneSeqSize;
+                }
+            }
+            else
+            {
+                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ + { + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +static size_t +ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if DYNAMIC_BMI2 + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +static BMI2_TARGET_ATTRIBUTE size_t +ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +#endif /* DYNAMIC_BMI2 */ + +typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame); + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static size_t +ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const 
ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequences"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +/* ZSTD_decompressSequencesLong() : + * decompression function triggered when a minimum share of offsets is considered "long", + * aka out of cache. + * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". + * This function will try to mitigate main memory latency through the use of prefetching */ +static size_t +ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +/* ZSTD_getLongOffsetsShare() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) + * compared to maximum possible of (1< 22) total += 1; + } + + assert(tableLog <= OffFSELog); + total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + + return total; +} +#endif + +size_t +ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) +{ /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. + * We don't expect that to be the case in 64-bit mode. + * In block mode, window size is not known, so we have to be conservative. 
+ * (note: but it could be evaluated from current-lowLimit) + */ + ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + + RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; + } + + /* Build Decoding Tables */ + { + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. + */ +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; +#endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + srcSize -= seqHSize; + + RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if ( !usePrefetchDecoder + && (!frame || (dctx->fParams.windowSize > (1<<24))) + && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ + U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); + U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (shareLongOffsets >= minShare); + } +#endif + + dctx->ddictIsCold = 0; + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if (usePrefetchDecoder) +#endif +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + } +} + + +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) +{ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; + dctx->previousDstEnd = dst; + } +} + + +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t dSize; + ZSTD_checkContinuity(dctx, dst, dstCapacity); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; +} diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.h b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.h new file mode 100644 index 000000000..c61a9d0c4 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_block.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) Yann Collet, 
Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +#ifndef ZSTD_DEC_BLOCK_H +#define ZSTD_DEC_BLOCK_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include "../common/zstd_deps.h" /* size_t */ +#include "../zstd.h" /* DCtx, and some public functions */ +#include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */ +#include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */ + + +/* === Prototypes === */ + +/* note: prototypes already published within `zstd.h` : + * ZSTD_decompressBlock() + */ + +/* note: prototypes already published within `zstd_internal.h` : + * ZSTD_getcBlockSize() + * ZSTD_decodeSeqHeaders() + */ + + + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming = 0, + is_streaming = 1 +} streaming_operation; + +/* ZSTD_decompressBlock_internal() : + * decompress block, starting at `src`, + * into destination buffer `dst`. + * @return : decompressed block size, + * or an error code (which can be tested using ZSTD_isError()) + */ +size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming); + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * this function must be called with valid parameters only + * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) + * in which case it cannot fail. + * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes, which is + * defined in zstd_decompress_internal.h. + * Internal use only. + */ +void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U8* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + + +#endif /* ZSTD_DEC_BLOCK_H */ diff --git a/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_internal.h b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_internal.h new file mode 100644 index 000000000..91e9dceb5 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/decompress/zstd_decompress_internal.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* zstd_decompress_internal: + * objects and definitions shared within lib/decompress modules */ + + #ifndef ZSTD_DECOMPRESS_INTERNAL_H + #define ZSTD_DECOMPRESS_INTERNAL_H + + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include "../common/mem.h" /* BYTE, U16, U32 */ +#include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. 
*/ + + + +/*-******************************************************* + * Constants + *********************************************************/ +static UNUSED_ATTR const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, + 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + +static UNUSED_ATTR const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; + +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 }; + +static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, + 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + +/*-******************************************************* + * Decompression types + *********************************************************/ + typedef struct { + U32 fastMode; + U32 tableLog; + } ZSTD_seqSymbol_header; + + typedef struct { + U16 nextState; + BYTE nbAdditionalBits; + BYTE nbBits; + U32 baseValue; + } ZSTD_seqSymbol; + + #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) + +#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) +#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + +typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ + HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; +} ZSTD_entropyDTables_t; + +typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader, + ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock, + ZSTDds_decompressLastBlock, ZSTDds_checkChecksum, + ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage; + +typedef enum { zdss_init=0, zdss_loadHeader, + zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + +typedef enum { + ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */ + ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */ + ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */ +} ZSTD_dictUses_e; + +/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */ +typedef struct { + const ZSTD_DDict** ddictPtrTable; + size_t ddictPtrTableSize; + size_t ddictPtrCount; +} ZSTD_DDictHashSet; + +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuffer */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, 
ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */ + ZSTD_split = 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + +struct ZSTD_DCtx_s +{ + const ZSTD_seqSymbol* LLTptr; + const ZSTD_seqSymbol* MLTptr; + const ZSTD_seqSymbol* OFTptr; + const HUF_DTable* HUFptr; + ZSTD_entropyDTables_t entropy; + U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */ + const void* previousDstEnd; /* detect continuity */ + const void* prefixStart; /* start of current segment */ + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; + ZSTD_frameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ + ZSTD_dStage stage; + U32 litEntropy; + U32 fseEntropy; + XXH64_state_t xxhState; + size_t headerSize; + ZSTD_format_e format; + ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum; /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */ + U32 validateChecksum; /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */ + const BYTE* litPtr; + ZSTD_customMem customMem; + size_t litSize; + size_t rleSize; + size_t staticSize; +#if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ +#endif + + /* dictionary */ + ZSTD_DDict* ddictLocal; + const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */ + U32 dictID; + int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ + + /* streaming */ + ZSTD_dStreamStage streamStage; + char* inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char* outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t lhSize; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + void* legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; +#endif + U32 hostageByte; + int noForwardProgress; + ZSTD_bufferMode_e outBufferMode; + ZSTD_outBuffer expectedOutBuffer; + + /* workspace */ + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */ + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; + + size_t oversizedDuration; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + void const* dictContentBeginForFuzzing; + void const* dictContentEndForFuzzing; +#endif + + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif +}; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 != 0 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} + +/*-******************************************************* + * Shared internal functions + *********************************************************/ + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */ +size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize); + +/*! ZSTD_checkContinuity() : + * check if next `dst` follows previous position, where decompression ended. + * If yes, do nothing (continue on current segment). + * If not, classify previous segment as "external dictionary", and start a new segment. + * This function cannot fail. */ +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize); + + +#endif /* ZSTD_DECOMPRESS_INTERNAL_H */ diff --git a/GraphBLAS/zstd/zstd_subset/zstd.h b/GraphBLAS/zstd/zstd_subset/zstd.h new file mode 100644 index 000000000..a290ec485 --- /dev/null +++ b/GraphBLAS/zstd/zstd_subset/zstd.h @@ -0,0 +1,2618 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBLE +# if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) +# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDLIB_VISIBLE +# define ZSTDLIB_HIDDEN +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBLE +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBLE +#endif + +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. + */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define ZSTD_DEPRECATED(message) [[deprecated(message)]] +# elif (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ZSTD_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. + The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. 
They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. +*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 3 +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + +/*! ZSTD_versionNumber() : + * Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). */ +ZSTDLIB_API unsigned ZSTD_versionNumber(void); + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) + +/*! ZSTD_versionString() : + * Return runtime library version, like "1.4.5". Requires v1.3.0+. */ +ZSTDLIB_API const char* ZSTD_versionString(void); + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. 
+ * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API +ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. 
+ * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer */ + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/********************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove from experimental API entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. + Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. 
+ * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. + * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. 
+ * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. + * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and + * compression strategy >= ZSTD_btopt (== compression level 16+) */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". */ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * Otherwise, trying to set any other value than default (0) will be a no-op and return an error. + * In a situation where it's unknown if the linked library supports multi-threading or not, + * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property. + */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression is performed in parallel, within worker thread(s). + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. 
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, + * compression is performed inside Caller's thread, and all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest. + * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder + * ZSTD_c_prefetchCDictTables + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, + ZSTD_c_experimentalParam10=1007, + ZSTD_c_experimentalParam11=1008, + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, + ZSTD_c_experimentalParam15=1012, + ZSTD_c_experimentalParam16=1013 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. 
Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). + * Exception : when using multi-threading mode (nbWorkers >= 1), + * the following parameters can be updated _during_ compression (within same frame): + * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * new parameters will be active for next job only (after a flush()). + * @return : an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag. + * This value will also be controlled at end of frame, and trigger an error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame. + * Note 2 : pledgedSrcSize is only valid once, for the next frame. + * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN. + * Note 3 : Whenever all input data is provided and consumed in a single round, + * for example with ZSTD_compress2(), + * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end), + * this value is automatically overridden by srcSize instead. + */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + +typedef enum { + ZSTD_reset_session_only = 1, + ZSTD_reset_parameters = 2, + ZSTD_reset_session_and_parameters = 3 +} ZSTD_ResetDirective; + +/*! ZSTD_CCtx_reset() : + * There are 2 different things that can be reset, independently or jointly : + * - The session : will stop compressing current frame, and make CCtx ready to start a new one. + * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*********************************************** +* Advanced decompression API (Requires v1.4.0+) +************************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). + * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, + ZSTD_d_experimentalParam4=1003 + +} ZSTD_dParameter; + +/*! ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. 
+ * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. +* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. 
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. +* The epilogue is required for decoders to consider a frame completed. +* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to +* start a new frame. +* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if frame fully completed and fully flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* *******************************************************************/ + +typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */ + /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */ +/*===== ZSTD_CStream management functions =====*/ +ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); +ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); /* accept NULL pointer */ + +/*===== Streaming compression functions =====*/ +typedef enum { + ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */ + ZSTD_e_flush=1, /* flush any data provided so far, + * it creates (at least) one new block, that can be decoded immediately on reception; + * frame will continue: any future data can still reference previously compressed data, improving compression. + * note : multithreaded compression will block to flush as much output as possible. */ + ZSTD_e_end=2 /* flush any remaining data _and_ close current frame. + * note that frame is only closed after compressed data is fully flushed (return value == 0). + * After that point, any additional data starts a new frame. + * note : each frame is independent (does not reference any content from previous frame). + : note : multithreaded compression will block to flush as much output as possible. */ +} ZSTD_EndDirective; + +/*! ZSTD_compressStream2() : Requires v1.4.0+ + * Behaves about the same as ZSTD_compressStream, with additional control on end directive. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) + * - output->pos must be <= dstCapacity, input->pos must be <= srcSize + * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit. + * - endOp must be a valid directive + * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. 
+ * - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flushes to output whatever is available,
+ * and then immediately returns, just indicating that there is some data remaining to be flushed.
+ * The function nonetheless guarantees forward progress : it will return only after it has read or written at least one byte.
+ * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ * - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ * or an error code, which can be tested using ZSTD_isError().
+ * if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ * only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ * Before starting a new compression job, or changing compression parameters,
+ * it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossings of such an interface to an absolute minimum.
+ * It is not rare for more time to be spent crossing the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the number of round trips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guaranteed to successfully flush at least one complete compressed block. */
+
+
+/* *****************************************************************************
+ * The following is a legacy streaming API, available since v1.0+ .
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Streaming in combination with advanced parameters and dictionary compression
+ * can only be used through the new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum number of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repeatedly to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+* or an error code, which can be tested using ZSTD_isError(),
+* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+* the return value is a suggested next input size (just a hint for better latency)
+* that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer */
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guaranteed to successfully flush at least one complete block in all circumstances. */
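+
+
+/* A rough decompression-loop sketch (illustration only : `fin`, `fout`, `inBuf` and
+ * `outBuf` are assumed to be managed by the caller; they are not part of this API) :
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     size_t readSize;
+ *     while ( (readSize = fread(inBuf, 1, ZSTD_DStreamInSize(), fin)) != 0 ) {
+ *         ZSTD_inBuffer input = { inBuf, readSize, 0 };
+ *         while (input.pos < input.size) {
+ *             ZSTD_outBuffer output = { outBuf, ZSTD_DStreamOutSize(), 0 };
+ *             size_t const ret = ZSTD_decompressStream(dctx, &output, &input);  // 0 == frame done
+ *             if (ZSTD_isError(ret)) break;                                     // handle error
+ *             fwrite(outBuf, 1, output.pos, fout);
+ *         }
+ *     }
+ *     ZSTD_freeDCtx(dctx);
+ */
+
+
+/**************************
+* Simple dictionary API
+***************************/
+/*!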
ZSTD_compress_usingDict() : + * Compression at an explicit compression level using a Dictionary. + * A dictionary can be any arbitrary data segment (also called a prefix), + * or a buffer with specified information (see zdict.h). + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + int compressionLevel); + +/*! ZSTD_decompress_usingDict() : + * Decompression using a known Dictionary. + * Dictionary must be identical to the one used during compression. + * Note : This function loads the dictionary, resulting in significant startup delay. + * It's intended for a dictionary used only once. + * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */ +ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*********************************** + * Bulk processing dictionary API + **********************************/ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/*! ZSTD_createCDict() : + * When compressing multiple messages or blocks using the same dictionary, + * it's recommended to digest the dictionary only once, since it's a costly operation. + * ZSTD_createCDict() will create a state from digesting a dictionary. + * The resulting state can be used for future compression operations with very limited startup cost. + * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. + * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict. + * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content. + * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer, + * in which case the only thing that it transports is the @compressionLevel. + * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively, + * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, + int compressionLevel); + +/*! ZSTD_freeCDict() : + * Function frees memory allocated by ZSTD_createCDict(). + * If a NULL pointer is passed, no operation is performed. */ +ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary. + * Recommended when same dictionary is used multiple times. + * Note : compression level is _decided at dictionary creation time_, + * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict); + + +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/*! ZSTD_createDDict() : + * Create a digested dictionary, ready to start decompression operation without startup delay. + * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */ +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); + +/*! 
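A compress/decompress round-trip sketch with digested dictionaries
+ * (illustration only : `dictBuf`/`dictSize`, the data buffers, and the existing
+ * `cctx`/`dctx` contexts are assumed to be managed by the caller) :
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3);
+ *     size_t const csize = ZSTD_compress_usingCDict(cctx, dst, dstCap, src, srcSize, cdict);
+ *     ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictSize);
+ *     size_t const dsize = ZSTD_decompress_usingDDict(dctx, out, outCap, dst, csize, ddict);
+ *     ZSTD_freeCDict(cdict);
+ *     ZSTD_freeDDict(ddict);
+ */
+
+/*!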
ZSTD_freeDDict() :
+ * Function frees memory allocated with ZSTD_createDDict().
+ * If a NULL pointer is passed, no operation is performed. */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ * Decompression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict);
+
+
+/********************************
+ * Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+
+ * Provides the dictID of the dictionary loaded into `cdict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict);
+
+/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API (Requires v1.4.0+)
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
+ * are only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+ * compression parameters can no longer be changed after loading a dictionary.
+ * Note 3 : `dict` content will be copied internally.
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ * to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
+ * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ * The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+
+ * Reference a prefix (single-usage dictionary) for next compressed frame.
+ * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ * Its content must remain unmodified during compression.
+ * Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ * ensure that the window size is large enough to contain the entire source.
+ * See ZSTD_c_windowLog.
+ * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ * It's a CPU consuming operation, with non-negligible impact on latency.
+ * If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+/*!
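A prefix (diff-like) round-trip sketch (illustration only : `prev` is assumed to be
+ * a caller-provided prior version of the data, and `cctx`/`dctx` are existing contexts) :
+ *
+ *     ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refPrefix(cctx, prev, prevSize);
+ *     size_t const csize = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);
+ *
+ *     ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refPrefix(dctx, prev, prevSize);    // decompression must use the same prefix
+ *     size_t const dsize = ZSTD_decompressDCtx(dctx, out, outCap, dst, csize);
+ */
+
+/*!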
ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ + * Create an internal DDict from dict buffer, + * to be used to decompress next frames. + * The dictionary remains valid for all future frames, until explicitly invalidated. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". + * Note 1 : Loading a dictionary involves building tables, + * which has a non-negligible impact on CPU usage and latency. + * It's recommended to "load once, use many times", to amortize the cost + * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading. + * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead. + * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of + * how dictionary content is loaded and interpreted. + */ +ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); + +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ + * Reference a prepared dictionary, to be used to decompress next frames. + * The dictionary remains active for decompression of future frames using same DCtx. + * + * If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function + * will store the DDict references in a table, and the DDict used for decompression + * will be determined at decompression time, as per the dict ID in the frame. + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Currently, only one dictionary can be managed. + * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ + * Reference a prefix (single-usage dictionary) to decompress next frame. + * This is the reverse operation of ZSTD_CCtx_refPrefix(), + * and must use the same prefix as the one used during compression. + * Prefix is **only used once**. Reference is discarded at end of frame. + * End of frame is reached when ZSTD_decompressStream() returns 0. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary + * Note 2 : Prefix buffer is referenced. It **must** outlive decompression. + * Prefix buffer must remain unmodified up to the end of frame, + * reached when ZSTD_decompressStream() returns 0. + * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent). + * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section) + * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + * A full dictionary is more costly, as it requires building tables. + */ +ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, + const void* prefix, size_t prefixSize); + +/* === Memory management === */ + +/*! ZSTD_sizeof_*() : Requires v1.4.0+ + * These functions give the _current_ memory usage of selected object. + * Note that object memory usage can evolve (increase or decrease) over time. 
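+ * For instance (sketch) : size_t const cur = ZSTD_sizeof_CCtx(cctx);   // current footprint, in bytes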
*/ +ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); +ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + +#endif /* ZSTD_H_235446 */ + + +/* ************************************************************************************** + * ADVANCED AND EXPERIMENTAL FUNCTIONS + **************************************************************************************** + * The definitions in the following section are considered experimental. + * They are provided for advanced scenarios. + * They should never be used with a dynamic library, as prototypes may change in the future. + * Use them only in association with static linking. + * ***************************************************************************************/ + +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + +/* This can be overridden externally to hide static symbols. */ +#ifndef ZSTDLIB_STATIC_API +# if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_STATIC_API __declspec(dllexport) ZSTDLIB_VISIBLE +# elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_STATIC_API __declspec(dllimport) ZSTDLIB_VISIBLE +# else +# define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE +# endif +#endif + +/**************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** + * The following symbols and constants + * are not planned to join "stable API" status in the near future. + * They can still change in future versions. + * Some of them are planned to remain in the static_only section indefinitely. + * Some of them might be removed in the future (especially when redundant with existing stable functions) + * ***************************************************************************************/ + +#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */ +#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2) +#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */ +#define ZSTD_SKIPPABLEHEADERSIZE 8 + +/* compression parameter bounds */ +#define ZSTD_WINDOWLOG_MAX_32 30 +#define ZSTD_WINDOWLOG_MAX_64 31 +#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30) +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX_32 29 +#define ZSTD_CHAINLOG_MAX_64 30 +#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? 
ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN ZSTD_fast
+#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN 0
+#define ZSTD_OVERLAPLOG_MAX 9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame
+ * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+ * to preserve host's memory from unreasonable requirements.
+ * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+ * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN 4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN 64
+#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN 0
+#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+
+
+/* --- Advanced types --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+ unsigned int offset; /* The offset of the match. (NOT the same as the offset code)
+ * If offset == 0 and matchLength == 0, this sequence represents the last
+ * literals in the block of litLength size. */
+ unsigned int litLength; /* Literal length of the sequence. */
+ unsigned int matchLength; /* Match length of the sequence. */
+ unsigned int rep; /* Represents which repeat offset is represented. Ranges from [0, 3].
+ * If rep == 0, then 'offset' does not contain a repeat offset.
+ * If rep > 0:
+ * If litLength != 0:
+ * rep == 1 --> offset == repeat_offset_1
+ * rep == 2 --> offset == repeat_offset_2
+ * rep == 3 --> offset == repeat_offset_3
+ * If litLength == 0:
+ * rep == 1 --> offset == repeat_offset_2
+ * rep == 2 --> offset == repeat_offset_3
+ * rep == 3 --> offset == repeat_offset_1 - 1
+ *
+ * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
+ * 'rep', but repeat offsets do not necessarily need to be calculated from an external
+ * sequence provider's perspective. For example, ZSTD_compressSequences() does not
+ * use this 'rep' field at all (as of now).
+ */
+} ZSTD_Sequence;
+
+typedef struct {
+ unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
+ unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+ unsigned hashLog; /**< dispatch table : larger == faster, more memory */
+ unsigned searchLog; /**< nb of searches : larger == more compression, slower */
+ unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */
+ unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+ ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+ int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+ int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+ int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+ ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+ ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+ ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+ ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+ ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+ ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */
+ ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
+ * Useful to save 4 bytes per generated frame.
+ * The decoder cannot automatically recognise this format, so this instruction is required. */
+} ZSTD_format_e;
+
+typedef enum {
+ /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */
+ ZSTD_d_validateChecksum = 0,
+ ZSTD_d_ignoreChecksum = 1
+} ZSTD_forceIgnoreChecksum_e;
+
+typedef enum {
+ /* Note: this enum controls ZSTD_d_refMultipleDDicts */
+ ZSTD_rmd_refSingleDDict = 0,
+ ZSTD_rmd_refMultipleDDicts = 1
+} ZSTD_refMultipleDDicts_e;
+
+typedef enum {
+ /* Note: this enum and the behavior it controls are effectively internal
+ * implementation details of the compressor. They are expected to continue
+ * to evolve and should be considered only in the context of extremely
+ * advanced performance tuning.
+ *
+ * Zstd currently supports the use of a CDict in three ways:
+ *
+ * - The contents of the CDict can be copied into the working context. This
+ * means that the compression can search both the dictionary and input
+ * while operating on a single set of internal tables. This makes
+ * the compression faster per-byte of input. However, the initial copy of
+ * the CDict's tables incurs a fixed cost at the beginning of the
+ * compression. For small compressions (< 8 KB), that copy can dominate
+ * the cost of the compression.
+ *
+ * - The CDict's tables can be used in-place. In this model, compression is
+ * slower per input byte, because the compressor has to search two sets of
+ * tables. However, this model incurs no start-up cost (as long as the
+ * working context's tables can be reused). For small inputs, this can be
+ * faster than copying the CDict's tables.
+ *
+ * - The CDict's tables are not used at all, and instead we use the working
+ * context alone to reload the dictionary and use params based on the source
+ * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+ * This method is effective when the dictionary sizes are very small relative
+ * to the input size, and the input size is fairly large to begin with.
+ *
+ * Zstd has a simple internal heuristic that selects which strategy to use
+ * at the beginning of a compression. However, if experimentation shows that
+ * Zstd is making poor choices, it is possible to override that choice with
+ * this enum.
+ */
+ ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+ ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */
+ ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */
+ ZSTD_dictForceLoad = 3 /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+ ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level.
+ * Negative compression levels will be uncompressed, and positive compression
+ * levels will be compressed. */
+ ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be
+ * emitted if Huffman compression is not profitable. */
+ ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+typedef enum {
+ /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final
+ * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable
+ * or ZSTD_ps_disable allows force-enabling or force-disabling the feature.
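+ * For instance (sketch) : ZSTD_CCtx_setParameter(cctx, ZSTD_c_useRowMatchFinder, ZSTD_ps_enable);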
+ */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +} ZSTD_paramSwitch_e; + +/*************************************** +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. */ +ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. + * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. + * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value. + * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: + * upper-bound = # blocks * min(128 KB, Window_Size) + */ +ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + +/*! ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + +typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +} ZSTD_sequenceFormat_e; + +/*! ZSTD_generateSequences() : + * Generate sequences using ZSTD_compress2(), given a source buffer. 
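+ * (A pipeline sketch, illustration only, with `seqs`/`seqsCapacity` and the data buffers being caller-provided :
+ *      size_t const nbSeqs = ZSTD_generateSequences(zc, seqs, seqsCapacity, src, srcSize);
+ *      ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
+ *      size_t const csize = ZSTD_compressSequences(cctx, dst, dstCapacity, seqs, nbSeqs, src, srcSize); )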
+ * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * + * @zc can be used to insert custom compression params. + * This function invokes ZSTD_compress2(). + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters + * @return : number of sequences generated + */ + +ZSTDLIB_STATIC_API size_t +ZSTD_generateSequences( ZSTD_CCtx* zc, + ZSTD_Sequence* outSeqs, size_t outSeqsSize, + const void* src, size_t srcSize); + +/*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals + * by merging them into the literals of the next sequence. + * + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters + * @return : number of sequences left after merging + */ +ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + +/*! ZSTD_compressSequences() : + * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. + * @src contains the entire input (not just the literals). + * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: + * If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * + * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined + * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. + * - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression. + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). 
See: doc/zstd_compression_format.md + * + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history + * @return : final compressed size, or a ZSTD error code. + */ +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize); + + +/*! ZSTD_writeSkippableFrame() : + * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. + * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so + * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant); + +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +/*************************************** +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). 
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ * Note 2 : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ * It will also consider src size to be arbitrarily "large", which is worst case.
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ * Note : CStream size estimation is only correct for single-threaded compression.
+ * ZSTD_DStream memory budget depends on window size.
+ * This information can be passed manually, using ZSTD_estimateDStreamSize,
+ * or deduced from a valid frame header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is initialized with ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, whose additional size is not estimated here.
+ * In this case, get the total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into.
+ * Provided pointer *must be 8-bytes aligned*.
+ * Buffer must outlive object.
+ * workspaceSize: Use ZSTD_estimate*Size() to determine
+ * how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ * or NULL if error (size too small, incorrect alignment, etc.)
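+ * A rough allocation sketch (illustration only) :
+ *      size_t const need = ZSTD_estimateCCtxSize(3);            // budget for compression level 3
+ *      void* const workspace = malloc(need);                    // malloc() memory is suitably aligned
+ *      ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(workspace, need);
+ *      // cctx == NULL means the workspace was too small or misaligned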
+ * Note : zstd will never resize nor malloc() when using a static buffer.
+ * If the object requires more memory than available,
+ * zstd will just error out (typically ZSTD_error_memory_allocation).
+ * Note 2 : there is no corresponding "free" function.
+ * Since workspace is allocated externally, it must be freed externally too.
+ * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ * into its associated cParams.
+ * Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ * Limitation 2 : static cctx currently not compatible with multi-threading.
+ * Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams);
+
+ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType);
+
+
+/*! Custom memory allocation :
+ * These prototypes make it possible to pass your own allocation/free functions.
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ * All allocation/free operations will be completed using these custom variants instead of regular ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static
+#ifdef __GNUC__
+__attribute__((__unused__))
+#endif
+ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams,
+ ZSTD_customMem customMem);
+
+/*! Thread pool :
+ * These prototypes make it possible to share a thread pool among multiple compression contexts.
+ * This can limit resources for applications with multiple threads where each one uses
+ * a threaded compression mode (via ZSTD_c_nbWorkers parameter).
+ * ZSTD_createThreadPool creates a new thread pool with a given number of threads.
+ * Note that such a pool must remain valid for as long as it is being used.
+ * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. + */ +typedef struct POOL_ctx_s ZSTD_threadPool; +ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); +ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); + + +/* + * This API is temporary and is expected to change or disappear in the future! + */ +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* cctxParams, + ZSTD_customMem customMem); + +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + +/*************************************** +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. + * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. + * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. 
*/ +ZSTDLIB_STATIC_API +ZSTD_DEPRECATED("use ZSTD_compress2") +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ +ZSTDLIB_STATIC_API +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2. + * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. + * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. 
+ * Accepts values from the ZSTD_dictAttachPref_e enum. + * See the comments on that enum for an explanation of the feature. */ +#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +/* Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals + * may still be emitted if huffman is not beneficial to use.) + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * literals compression based on the compression parameters - specifically, + * negative compression levels do not use literal compression. + */ +#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +/* Tries to fit compressed block size to be around targetCBlockSize. + * No target when targetCBlockSize == 0. + * There is no guarantee on compressed block size (default:0) */ +#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 + +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + +/* Controls whether the new and experimental "dedicated dictionary search + * structure" can be used. This feature is still rough around the edges, be + * prepared for surprising behavior! + * + * How to use it: + * + * When using a CDict, whether to use this feature or not is controlled at + * CDict creation, and it must be set in a CCtxParams set passed into that + * construction (via ZSTD_createCDict_advanced2()). A compression will then + * use the feature or not based on how the CDict was constructed; the value of + * this param, set in the CCtx, will have no effect. + * + * However, when a dictionary buffer is passed into a CCtx, such as via + * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control + * whether the CDict that is created internally can use the feature or not. + * + * What it does: + * + * Normally, the internal data structures of the CDict are analogous to what + * would be stored in a CCtx after compressing the contents of a dictionary. + * To an approximation, a compression using a dictionary can then use those + * data structures to simply continue what is effectively a streaming + * compression where the simulated compression of the dictionary left off. + * Which is to say, the search structures in the CDict are normally the same + * format as in the CCtx. + * + * It is possible to do better, since the CDict is not like a CCtx: the search + * structures are written once during CDict creation, and then are only read + * after that, while the search structures in the CCtx are both read and + * written as the compression goes along. This means we can choose a search + * structure for the dictionary that is read-optimized. + * + * This feature enables the use of that different structure. + * + * Note that some of the members of the ZSTD_compressionParameters struct have + * different semantics and constraints in the dedicated search structure. It is + * highly recommended that you simply set a compression level in the CCtxParams + * you pass into the CDict creation call, and avoid messing with the cParams + * directly. + * + * Effects: + * + * This will only have any effect when the selected ZSTD_strategy + * implementation supports this feature. 
Currently, that's limited to
+ * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2.
+ *
+ * Note that this means that the CDict tables can no longer be copied into the
+ * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be
+ * usable. The dictionary can only be attached or reloaded.
+ *
+ * In general, you should expect compression to be faster--sometimes very much
+ * so--and CDict creation to be slightly slower. Eventually, we will probably
+ * make this mode the default.
+ */
+#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8
+
+/* ZSTD_c_stableInBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that input data presented with ZSTD_inBuffer
+ * will ALWAYS be the same between calls.
+ * Technically, the @src pointer must never be changed,
+ * and the @pos field can only be updated by zstd.
+ * However, it's possible to increase the @size field,
+ * allowing scenarios where more data can be appended after compression starts.
+ * These conditions are checked by the compressor,
+ * and compression will fail if they are not respected.
+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
+ * MUST not be modified during compression or it will result in data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an input window buffer,
+ * because the user guarantees it can reference the ZSTD_inBuffer until
+ * the frame is complete. But, it will still allocate an output buffer
+ * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, compression WILL fail if conditions are not respected.
+ *
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
+ * not be modified during compression or it will result in data corruption.
+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+ * but passing this flag tells zstd to rely on user provided buffer instead.
+ */
+#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
+
+/* ZSTD_c_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that the ZSTD_outBuffer will not be resized between
+ * calls. Specifically: (out.size - out.pos) will never grow. This gives the
+ * compressor the freedom to say: If the compressed data doesn't fit in the
+ * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to
+ * always compress directly into the output buffer, instead of compressing
+ * into an internal buffer and copying to the output buffer.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer. It will still allocate the
+ * input window buffer (see ZSTD_c_stableInBuffer).
+ *
+ * Zstd will check that (out.size - out.pos) never grows and return an error
+ * if it does. While not strictly necessary, this should prevent surprises.
+ */
+#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10
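+
+/* Usage sketch (editor's illustration, not part of the original header ;
+ * `dst`/`dstCapacity` and `src`/`srcSize` are caller-owned buffers) :
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableOutBuffer, 1);
+ *     ZSTD_outBuffer out = { dst, dstCapacity, 0 };   (never resized afterwards)
+ *     ZSTD_inBuffer  in  = { src, srcSize, 0 };
+ *     size_t const r = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
+ *     if (ZSTD_isError(r)) { }   (includes ZSTD_error_dstSizeTooSmall when
+ *                                 the fixed output buffer is too small)
+ *     ZSTD_freeCCtx(cctx);
+ */
+
+/* ZSTD_c_blockDelimiters
+ * Default is 0 == ZSTD_sf_noBlockDelimiters.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().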
+ *
+ * Designates whether or not the given array of ZSTD_Sequence contains block delimiters
+ * and last literals, which are defined as sequences with offset == 0 and matchLength == 0.
+ * See the definition of ZSTD_Sequence for more specifics.
+ */
+#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11
+
+/* ZSTD_c_validateSequences
+ * Default is 0 == disabled. Set to 1 to enable sequence validation.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ * Designates whether or not we validate sequences provided to ZSTD_compressSequences()
+ * during function execution.
+ *
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+ * With validation enabled, if a sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
+ *
+ */
+#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
+
+/* ZSTD_c_useBlockSplitter
+ * Controlled with ZSTD_paramSwitch_e enum.
+ * Default is ZSTD_ps_auto.
+ * Set to ZSTD_ps_disable to never use block splitter.
+ * Set to ZSTD_ps_enable to always use block splitter.
+ *
+ * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use
+ * block splitting based on the compression parameters.
+ */
+#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13
+
+/* ZSTD_c_useRowMatchFinder
+ * Controlled with ZSTD_paramSwitch_e enum.
+ * Default is ZSTD_ps_auto.
+ * Set to ZSTD_ps_disable to never use row-based matchfinder.
+ * Set to ZSTD_ps_enable to force usage of row-based matchfinder.
+ *
+ * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use
+ * the row-based matchfinder based on support for SIMD instructions and the window log.
+ * Note that this only pertains to compression strategies: greedy, lazy, and lazy2
+ */
+#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14
+
+/* ZSTD_c_deterministicRefPrefix
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Zstd produces different results for prefix compression when the prefix is
+ * directly adjacent to the data about to be compressed vs. when it isn't.
+ * This is because zstd detects that the two buffers are contiguous and it can
+ * use a more efficient match finding algorithm. However, this produces different
+ * results than when the two buffers are non-contiguous. This flag forces zstd
+ * to always load the prefix in non-contiguous mode, even if it happens to be
+ * adjacent to the data, to guarantee determinism.
+ *
+ * If you really care about determinism when using a dictionary or prefix,
+ * like when doing delta compression, you should select this option. It comes
+ * at a speed penalty of about 2.5% if the dictionary and data happened to be
+ * contiguous, and is free if they weren't contiguous. We don't expect that
+ * intentionally making the dictionary and data contiguous will be worth the
+ * cost to memcpy() the data.
+ */
+#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
+
+/* ZSTD_c_prefetchCDictTables
+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
+ *
+ * In some situations, zstd uses CDict tables in-place rather than copying them
+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
+ * In such situations, compression speed is seriously impacted when CDict tables are
+ * "cold" (outside CPU cache).
This parameter instructs zstd to prefetch CDict tables
+ * when they are used in-place.
+ *
+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
+ * into the working context, so there is no need to prefetch. This parameter is
+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
+ * useful but memcpy() is too expensive. The exact range of input sizes where this
+ * makes sense is best determined by careful experimentation.
+ *
+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
+ * but in the future zstd may conditionally enable this feature via an auto-detection
+ * heuristic for cold CDicts.
+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
+ */
+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
+
+/*! ZSTD_CCtx_getParameter() :
+ *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ *                                     an existing ZSTD_CCtx_params structure.
+ *                                     This is similar to
+ *                                     ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                    an existing CCtx.
+ *                                    These parameters will be applied to
+ *                                    all subsequent frames.
+ *  - ZSTD_compressStream2() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory, accept NULL pointer.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ *  for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);  /* accept NULL pointer */
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using
+ *  ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : a code representing success or failure (which can be tested with
+ *           ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ *  Similar to ZSTD_CCtx_getParameter.
+ *  Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started.
+ *  If nbWorkers==0, this will have no impact until a new compression is started.
+ *  If nbWorkers>=1, new parameters will be picked up at next job,
+ *  with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ *  Same as ZSTD_compressStream2(),
+ *  but using only integral types as arguments.
+ *  This variant might be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/***************************************
+*  Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but references `dict` content instead of copying it into `dctx`.
+ *  This saves memory if `dict` remains around.
+ *  However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but gives direct control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ *  Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
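+
+/* Usage sketch (editor's illustration, not part of the original header ;
+ * `dictBuf`/`dictBufSize` is a hypothetical caller-owned dictionary buffer) :
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     ZSTD_DCtx_loadDictionary_byReference(dctx, dictBuf, dictBufSize);
+ *     (dictBuf must stay valid and unmodified while dctx uses it)
+ *     size_t const r = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+ *     ZSTD_freeDCtx(dctx);
+ */
+
+/*!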
ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ *  This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/*! ZSTD_DCtx_getParameter() :
+ *  Get the requested decompression parameter value, selected by enum ZSTD_dParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/* ZSTD_d_forceIgnoreChecksum
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor to skip checksum validation during decompression, regardless
+ * of whether checksumming was specified during compression. This offers some
+ * slight performance benefits, and may be useful for debugging.
+ * Param has values of type ZSTD_forceIgnoreChecksum_e
+ */
+#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3
+
+/* ZSTD_d_refMultipleDDicts
+ * Experimental parameter.
+ * Default is 0 == disabled.
Set to 1 to enable.
+ *
+ * If enabled and dctx is allocated on the heap, then additional memory will be allocated
+ * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_DCtx_refDDict()
+ * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
+ * store all references. At decompression time, the appropriate dictID is selected
+ * from the set of DDicts based on the dictID in the frame.
+ *
+ * Usage is simply calling ZSTD_DCtx_refDDict() on multiple dict buffers.
+ *
+ * Param has values of type ZSTD_refMultipleDDicts_e
+ *
+ * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict() will trigger memory
+ * allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
+ * Memory is allocated as per ZSTD_DCtx::customMem.
+ *
+ * Although this function allocates memory for the table, the user is still responsible for
+ * memory management of the underlying ZSTD_DDict* themselves.
+ */
+#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
+
+
+/*! ZSTD_DCtx_setFormat() :
+ *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ *  Same as ZSTD_decompressStream(),
+ *  but using only integral types as arguments.
+ *  This can be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+*  Advanced streaming functions
+*  Warning : most of these functions are now redundant with the Advanced API.
+*  Once Advanced API reaches "stable" status,
+*  redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+
+/*! ZSTD_initCStream_srcSize() :
+ *  This function is DEPRECATED, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ *  pledgedSrcSize must be correct. If it is not known at init time, use
+ *  ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ *  "0" also disables frame content size field. It may be enabled in the future.
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+                                int compressionLevel,
+                                unsigned long long pledgedSrcSize);
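+
+/* For reference, a minimal modern streaming loop (editor's illustration, not
+ * part of the original header ; `readFn`/`writeFn` and both buffers are
+ * hypothetical caller-provided I/O ; error checks via ZSTD_isError() omitted) :
+ *
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, 3);
+ *     for (;;) {
+ *         size_t const n = readFn(inBuf, inBufSize);   (returns 0 on EOF)
+ *         ZSTD_inBuffer in = { inBuf, n, 0 };
+ *         ZSTD_EndDirective const mode = (n == 0) ? ZSTD_e_end : ZSTD_e_continue;
+ *         size_t remaining;
+ *         do {
+ *             ZSTD_outBuffer out = { outBuf, outBufSize, 0 };
+ *             remaining = ZSTD_compressStream2(zcs, &out, &in, mode);
+ *             writeFn(outBuf, out.pos);
+ *         } while (mode == ZSTD_e_end ? (remaining != 0) : (in.pos < in.size));
+ *         if (n == 0) break;
+ *     }
+ */
+
+/*!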
ZSTD_initCStream_usingDict() :
+ *  This function is DEPRECATED, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ *  Creates an internal CDict (incompatible with static CCtx), except if
+ *  dict == NULL or dictSize < 8, in which case no dict is used.
+ *  Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ *  it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+                                  const void* dict, size_t dictSize,
+                                  int compressionLevel);
+
+/*! ZSTD_initCStream_advanced() :
+ *  This function is DEPRECATED, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ *     for ((param, value) : params) {
+ *         ZSTD_CCtx_setParameter(zcs, param, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ *  dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ *  pledgedSrcSize must be correct.
+ *  If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params,
+                                 unsigned long long pledgedSrcSize);
+
+/*! ZSTD_initCStream_usingCDict() :
+ *  This function is DEPRECATED, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ *  note : cdict will just be referenced, and must outlive compression session
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/*! ZSTD_initCStream_usingCDict_advanced() :
+ *  This function is DEPRECATED, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ *     for ((fParam, value) : fParams) {
+ *         ZSTD_CCtx_setParameter(zcs, fParam, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ *  same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ *  pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ *  value ZSTD_CONTENTSIZE_UNKNOWN.
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                                            const ZSTD_CDict* cdict,
+                                            ZSTD_frameParameters fParams,
+                                            unsigned long long pledgedSrcSize);
+
+/*!
ZSTD_resetCStream() :
+ *  This function is DEPRECATED, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *  Note: ZSTD_resetCStream() interprets pledgedSrcSize == 0 as ZSTD_CONTENTSIZE_UNKNOWN, but
+ *        ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTENTSIZE_UNKNOWN must be
+ *        explicitly specified.
+ *
+ *  start a new frame, using same parameters from previous frame.
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  Note that zcs must be initialized at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and verified at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ *  This prototype will generate compilation warnings.
+ */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed), and produced (output) for current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ *  Tell how many bytes are ready to be flushed immediately.
+ *  Useful for multithreading scenarios (nbWorkers >= 1).
+ *  Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ *  and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ *  if @return == 0, it means either :
+ *  + there is no active job (could be checked with ZSTD_frameProgression()), or
+ *  + oldest job is still actively compressing data,
+ *    but everything it has produced has also been flushed so far,
+ *    therefore flush speed is limited by production speed of oldest job
+ *    irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
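+
+/* Usage sketch (editor's illustration, not part of the original header) :
+ * polling progress of a multithreaded streaming compression.
+ *
+ *     ZSTD_frameProgression const fp = ZSTD_getFrameProgression(cctx);
+ *     unsigned long long const backlog = fp.ingested - fp.consumed;
+ *     size_t const flushable = ZSTD_toFlushNow(cctx);
+ *     (backlog = bytes buffered but not yet compressed ;
+ *      flushable = compressed bytes retrievable right now)
+ */
+
+
+/*===== Advanced Streaming decompression functions =====*/
+
+/*!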
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/*!
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/*!
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
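+
+  Minimal sketch (editor's illustration, not part of the original header ;
+  `src`/`srcSize` are caller data, error checks via ZSTD_isError() omitted) :
+
+      size_t const cap = ZSTD_compressBound(srcSize);
+      char* const dst = (char*)malloc(cap);
+      char* op = dst;
+      ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+      ZSTD_compressBegin(cctx, 3);
+      op += ZSTD_compressContinue(cctx, op, cap, src, srcSize);
+      op += ZSTD_compressEnd(cctx, op, cap - (size_t)(op - dst), NULL, 0);
+      ZSTD_freeCCtx(cctx);
+      (the finished frame now occupies the range dst .. op)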
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
+ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use advanced API to access custom parameters")
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_STATIC_API
+ZSTD_DEPRECATED("use advanced API to access custom parameters")
+size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+/**
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           or an error code, which can be tested using ZSTD_isError().
+
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+  Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+  As a consequence, check that values remain within valid application range.
+  For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+  Each application can set its own limits, depending on local restrictions.
+  For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+  There are multiple ways to guarantee this condition.
+
+  The most memory efficient way is to use a round buffer of sufficient size.
+  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+  which can @return an error code if required value is too large for current system (in 32-bit mode).
+  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+  At which point, decoding can resume from the beginning of the buffer.
+  Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+  There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+  Finally, if you control the compression process, you can also ignore all buffer size rules,
+  as long as the encoder and decoder progress in "lock-step",
+  aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are set up, start decompression, with ZSTD_decompressBegin().
+  If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+  @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by the decompressor.
+  The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bit value
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+  For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available.
0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") +ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. + + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! 
+      + User __must__ test for such outcome and deal directly with uncompressed data
+      + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0.
+        Doing so would mess up the statistics history, leading to potential data corruption.
+      + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
+*/
+
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
+
+
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/GraphBLAS/zstd/zstd_subset/zstd_errors.h b/GraphBLAS/zstd/zstd_subset/zstd_errors.h
new file mode 100644
index 000000000..2ec0b0ab1
--- /dev/null
+++ b/GraphBLAS/zstd/zstd_subset/zstd_errors.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDERRORLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
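+ *
+ *  usage sketch (editor's illustration, not part of the original header ;
+ *  assumes <stdio.h> and caller buffers `dst`/`src`) :
+ *      size_t const r = ZSTD_compress(dst, dstCapacity, src, srcSize, 3);
+ *      if (ZSTD_isError(r)) {
+ *          ZSTD_ErrorCode const e = ZSTD_getErrorCode(r);
+ *          fprintf(stderr, "zstd: %s\n", ZSTD_getErrorString(e));
+ *      }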
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stabilityCondition_notRespected = 50,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall = 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  ZSTD_error_dstBuffer_null   = 74,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_dstBuffer_wrong     = 104,
+  ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare with the enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/README.md.suitesparse b/README.md.suitesparse
index dee3f1814..87cac6959 100644
--- a/README.md.suitesparse
+++ b/README.md.suitesparse
@@ -2,25 +2,22 @@ SuiteSparse:  A Suite of Sparse matrix packages at http://suitesparse.com
 -----------------------------------------------------------------------------
 
-Apr 10, 2022.  SuiteSparse VERSION 5.12.0
+Aug 25, 2022.  SuiteSparse VERSION 5.13.0
 
-    Now includes GraphBLAS, SLIP_LU, and a new interface to the SuiteSparse
-    Matrix Collection (ssget), via MATLAB and a Java GUI, to
-    http://sparse.tamu.edu.
+SuiteSparse is a set of sparse-matrix-related packages written or co-authored
+by Tim Davis, available at https://github.com/DrTimothyAldenDavis/SuiteSparse .
 
 Primary author of SuiteSparse (codes and algorithms, excl. METIS): Tim Davis
 
 Code co-authors, in alphabetical order (not including METIS):
-
-    Patrick Amestoy, David Bateman, Jinhao Chen. Yanqing Chen, Iain Duff,
+    Patrick Amestoy, David Bateman, Jinhao Chen, Yanqing Chen, Iain Duff,
     Les Foster, William Hager, Scott Kolodziej, Chris Lourenco, Stefan
     Larimore, Erick Moreno-Centeno, Ekanathan Palamadai, Sivasankaran
     Rajamanickam, Sanjay Ranka, Wissam Sid-Lakhdar, Nuri Yeralan.
 
 Additional algorithm designers: Esmond Ng and John Gilbert.
 
-Refer to each package for license, copyright, and author information.  All
-codes are authored or co-authored by Timothy A. Davis.
+Refer to each package for license, copyright, and author information.
 -----------------------------------------------------------------------------
 
 How to cite the SuiteSparse meta-package and its component packages:
diff --git a/SuiteSparse_config/Makefile b/SuiteSparse_config/Makefile
index 8eb60fd67..68ef36afe 100644
--- a/SuiteSparse_config/Makefile
+++ b/SuiteSparse_config/Makefile
@@ -7,7 +7,7 @@ export SUITESPARSE
 
 # version of SuiteSparse_config is also version of SuiteSparse meta-package
 LIBRARY = libsuitesparseconfig
-VERSION = 5.12.0
+VERSION = 5.13.0
 SO_VERSION = 5
 
 default: library
diff --git a/SuiteSparse_config/SuiteSparse_config.h b/SuiteSparse_config/SuiteSparse_config.h
index 1797e734b..354b993a7 100644
--- a/SuiteSparse_config/SuiteSparse_config.h
+++ b/SuiteSparse_config/SuiteSparse_config.h
@@ -223,10 +223,10 @@ int SuiteSparse_version     /* returns SUITESPARSE_VERSION */
 */
 
 #define SUITESPARSE_HAS_VERSION_FUNCTION
-#define SUITESPARSE_DATE "Apr 10, 2022"
+#define SUITESPARSE_DATE "Aug 25, 2022"
 #define SUITESPARSE_VER_CODE(main,sub) ((main) * 1000 + (sub))
 #define SUITESPARSE_MAIN_VERSION 5
-#define SUITESPARSE_SUB_VERSION 12
+#define SUITESPARSE_SUB_VERSION 13
 #define SUITESPARSE_SUBSUB_VERSION 0
 #define SUITESPARSE_VERSION \
     SUITESPARSE_VER_CODE(SUITESPARSE_MAIN_VERSION,SUITESPARSE_SUB_VERSION)
diff --git a/SuiteSparse_config/SuiteSparse_config.mk b/SuiteSparse_config/SuiteSparse_config.mk
index a95366431..3ce8de32d 100644
--- a/SuiteSparse_config/SuiteSparse_config.mk
+++ b/SuiteSparse_config/SuiteSparse_config.mk
@@ -7,7 +7,7 @@
 # and GraphBLAS.  The configuration settings for GraphBLAS are determined by
 # GraphBLAS/CMakeLists.txt
 
-SUITESPARSE_VERSION = 5.12.0
+SUITESPARSE_VERSION = 5.13.0
 
 #---------------------------------------------------------------------------
 # determine what system we are on