From f320511dce6374fc7e545a19edda52cfc471ab6f Mon Sep 17 00:00:00 2001
From: Henrik John
Date: Wed, 18 May 2022 17:40:06 +0200
Subject: [PATCH 01/18] Resolve an issue with starting betas being reset

During cross-validation, the beta values are reset to 0.0 when they should instead be reset to the starting betas, if provided.
---
 R/ModelFit.R                            |  1 +
 R/RcppExports.R                         | 12 +++++++
 src/RcppCyclopsInterface.cpp            | 22 +++++++++++++
 src/RcppExports.cpp                     | 43 +++++++++++++++++++++++++
 src/cyclops/CyclicCoordinateDescent.cpp | 20 +++++++++++-
 src/cyclops/CyclicCoordinateDescent.h   |  3 ++
 6 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/R/ModelFit.R b/R/ModelFit.R
index 79f00152..49424278 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -200,6 +200,7 @@ fitCyclopsModel <- function(cyclopsData,
         }
         .cyclopsSetBeta(cyclopsData$cyclopsInterfacePtr, startingCoefficients)
+        .cyclopsSetStartingBeta(cyclopsData$cyclopsInterfacePtr, startingCoefficients)
     }

     if (!is.null(fixedCoefficients)) {
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 24c97eb0..2794d953 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -25,10 +25,22 @@
     invisible(.Call(`_Cyclops_cyclopsSetBeta`, inRcppCcdInterface, beta))
 }

+.cyclopsGetBeta <- function(inRcppCcdInterface, index) {
+    .Call(`_Cyclops_cyclopsGetBeta`, inRcppCcdInterface, index)
+}
+
+.cyclopsSetStartingBeta <- function(inRcppCcdInterface, inStartingBeta) {
+    invisible(.Call(`_Cyclops_cyclopsSetStartingBeta`, inRcppCcdInterface, inStartingBeta))
+}
+
 .cyclopsSetFixedBeta <- function(inRcppCcdInterface, beta, fixed) {
     invisible(.Call(`_Cyclops_cyclopsSetFixedBeta`, inRcppCcdInterface, beta, fixed))
 }

+.cyclopsGetFixedBeta <- function(inRcppCcdInterface, index) {
+    .Call(`_Cyclops_cyclopsGetFixedBeta`, inRcppCcdInterface, index)
+}
+
 .cyclopsGetIsRegularized <- function(inRcppCcdInterface, index) {
     .Call(`_Cyclops_cyclopsGetIsRegularized`, inRcppCcdInterface, index)
 }
diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp
index 829d757b..eca98a13 100644
--- a/src/RcppCyclopsInterface.cpp
+++ b/src/RcppCyclopsInterface.cpp
@@ -112,6 +112,20 @@ void cyclopsSetBeta(SEXP inRcppCcdInterface, const std::vector<double>& beta) {
 	interface->getCcd().setBeta(beta);
 }

+// [[Rcpp::export(.cyclopsGetBeta)]]
+double cyclopsGetBeta(SEXP inRcppCcdInterface, const int index) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	return interface->getCcd().getBeta(index);
+}
+
+// [[Rcpp::export(.cyclopsSetStartingBeta)]]
+void cyclopsSetStartingBeta(SEXP inRcppCcdInterface, const std::vector<double>& inStartingBeta) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	interface->getCcd().setStartingBeta(inStartingBeta);
+}
+
 // [[Rcpp::export(.cyclopsSetFixedBeta)]]
 void cyclopsSetFixedBeta(SEXP inRcppCcdInterface, int beta, bool fixed) {
 	using namespace bsccs;
 	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
 	interface->getCcd().setFixedBeta(beta - 1, fixed);
 }

+// [[Rcpp::export(.cyclopsGetFixedBeta)]]
+bool cyclopsGetFixedBeta(SEXP inRcppCcdInterface, const int index) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	return interface->getCcd().getFixedBeta(index);
+}
+
+
 // [[Rcpp::export(".cyclopsGetIsRegularized")]]
 bool cyclopsGetIsRegularized(SEXP inRcppCcdInterface, const int index) {
 	using namespace bsccs;
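[Editorial note — not part of the patch.] The R sketch below illustrates the behaviour this commit fixes; the data, formula, and prior settings are hypothetical, while createCyclopsData(), createPrior(), fitCyclopsModel(), and its startingCoefficients argument are the existing Cyclops R API:

    library(Cyclops)

    # Hypothetical logistic-regression data
    cyclopsData <- createCyclopsData(y ~ x1 + x2, modelType = "lr")

    # With this patch, the starting coefficients are also retained as the values
    # that resetBeta() restores between cross-validation folds, instead of the
    # betas being silently reset to 0.0 after the first fold.
    fit <- fitCyclopsModel(cyclopsData,
                           prior = createPrior("laplace", useCrossValidation = TRUE),
                           startingCoefficients = c(-1.0, 0.5, 0.25))

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index daadf98a..1b059404 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,6 +6,11 @@
 using namespace Rcpp;

+#ifdef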
RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // cyclopsGetModelTypeNames std::vector cyclopsGetModelTypeNames(); RcppExport SEXP _Cyclops_cyclopsGetModelTypeNames() { @@ -68,6 +73,29 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// cyclopsGetBeta +double cyclopsGetBeta(SEXP inRcppCcdInterface, const int index); +RcppExport SEXP _Cyclops_cyclopsGetBeta(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsGetBeta(inRcppCcdInterface, index)); + return rcpp_result_gen; +END_RCPP +} +// cyclopsSetStartingBeta +void cyclopsSetStartingBeta(SEXP inRcppCcdInterface, const std::vector& inStartingBeta); +RcppExport SEXP _Cyclops_cyclopsSetStartingBeta(SEXP inRcppCcdInterfaceSEXP, SEXP inStartingBetaSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const std::vector& >::type inStartingBeta(inStartingBetaSEXP); + cyclopsSetStartingBeta(inRcppCcdInterface, inStartingBeta); + return R_NilValue; +END_RCPP +} // cyclopsSetFixedBeta void cyclopsSetFixedBeta(SEXP inRcppCcdInterface, int beta, bool fixed); RcppExport SEXP _Cyclops_cyclopsSetFixedBeta(SEXP inRcppCcdInterfaceSEXP, SEXP betaSEXP, SEXP fixedSEXP) { @@ -80,6 +108,18 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// cyclopsGetFixedBeta +bool cyclopsGetFixedBeta(SEXP inRcppCcdInterface, const int index); +RcppExport SEXP _Cyclops_cyclopsGetFixedBeta(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsGetFixedBeta(inRcppCcdInterface, index)); + return rcpp_result_gen; +END_RCPP +} // cyclopsGetIsRegularized bool cyclopsGetIsRegularized(SEXP inRcppCcdInterface, const int index); RcppExport SEXP _Cyclops_cyclopsGetIsRegularized(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { @@ -771,7 +811,10 @@ static const R_CallMethodDef CallEntries[] = { {"_Cyclops_cyclopsGetUseOffsetNames", (DL_FUNC) &_Cyclops_cyclopsGetUseOffsetNames, 0}, {"_Cyclops_cyclopsGetComputeDevice", (DL_FUNC) &_Cyclops_cyclopsGetComputeDevice, 1}, {"_Cyclops_cyclopsSetBeta", (DL_FUNC) &_Cyclops_cyclopsSetBeta, 2}, + {"_Cyclops_cyclopsGetBeta", (DL_FUNC) &_Cyclops_cyclopsGetBeta, 2}, + {"_Cyclops_cyclopsSetStartingBeta", (DL_FUNC) &_Cyclops_cyclopsSetStartingBeta, 2}, {"_Cyclops_cyclopsSetFixedBeta", (DL_FUNC) &_Cyclops_cyclopsSetFixedBeta, 3}, + {"_Cyclops_cyclopsGetFixedBeta", (DL_FUNC) &_Cyclops_cyclopsGetFixedBeta, 2}, {"_Cyclops_cyclopsGetIsRegularized", (DL_FUNC) &_Cyclops_cyclopsGetIsRegularized, 2}, {"_Cyclops_cyclopsSetWeights", (DL_FUNC) &_Cyclops_cyclopsSetWeights, 2}, {"_Cyclops_cyclopsSetCensorWeights", (DL_FUNC) &_Cyclops_cyclopsSetCensorWeights, 2}, diff --git a/src/cyclops/CyclicCoordinateDescent.cpp b/src/cyclops/CyclicCoordinateDescent.cpp index 7ec11e3e..20d51d31 100644 --- a/src/cyclops/CyclicCoordinateDescent.cpp +++ b/src/cyclops/CyclicCoordinateDescent.cpp @@ -203,10 +203,14 @@ void 
CyclicCoordinateDescent::init(bool offset) {
 	hDelta.resize(J, static_cast<double>(initialBound));
 	hBeta.resize(J, static_cast<double>(0.0));

+	// initialize starting betas to default value 0.0
+	startingBeta.resize(J, static_cast<double>(0.0));
+
 //	hXBeta.resize(K, static_cast<double>(0.0));
 //	hXBetaSave.resize(K, static_cast<double>(0.0));

 	fixBeta.resize(J, false);
+
 	hWeights.resize(0);
 	cWeights.resize(0); // ESK: For censor weights

@@ -217,6 +221,7 @@ void CyclicCoordinateDescent::init(bool offset) {
 	varianceKnown = false;
 	if (offset) {
 		hBeta[0] = static_cast<double>(1);
+		startingBeta[0] = static_cast<double>(1);
 		fixBeta[0] = true;
 		xBetaKnown = false;
 	} else {
@@ -257,7 +262,13 @@ void CyclicCoordinateDescent::resetBeta(void) {
 	auto start = hXI.getHasOffsetCovariate() ? 1 : 0;
 	for (auto j = start; j < J; j++) {
-		hBeta[j] = 0.0;
+
+		// restore the starting beta if a non-zero value was provided
+		if (startingBeta[j] != 0.0) {
+			hBeta[j] = startingBeta[j];
+		} else {
+			hBeta[j] = 0.0;
+		}
 	}
 	computeXBeta();
 	sufficientStatisticsKnown = false;
@@ -470,6 +481,13 @@ void CyclicCoordinateDescent::setBeta(const std::vector<double>& beta) {
 	varianceKnown = false;
 }

+void CyclicCoordinateDescent::setStartingBeta(const std::vector<double>& inStartingBeta) {
+	// ToDo: This functionality could be merged into setBeta()
+	for (int j = 0; j < J; ++j) {
+		startingBeta[j] = inStartingBeta[j];
+	}
+}
+
 void CyclicCoordinateDescent::setBeta(int i, double beta) {
 #define PROCESS_IN_MS
 #ifdef PROCESS_IN_MS
diff --git a/src/cyclops/CyclicCoordinateDescent.h b/src/cyclops/CyclicCoordinateDescent.h
index 2e4e3575..e18c5ce0 100644
--- a/src/cyclops/CyclicCoordinateDescent.h
+++ b/src/cyclops/CyclicCoordinateDescent.h
@@ -131,6 +131,8 @@ class CyclicCoordinateDescent {
 	// template
 	void setBeta(const std::vector<double>& beta);

+	void setStartingBeta(const std::vector<double>& inStartingBeta);
+
 	void setBeta(int i, double beta);

 	// void double getHessianComponent(int i, int j);
@@ -328,6 +330,7 @@ class CyclicCoordinateDescent {
 	typedef std::vector<double> DoubleVector;

 	DoubleVector hBeta;
+	DoubleVector startingBeta;
 //	DoubleVector& hXBeta; // TODO Delegate to ModelSpecifics
 //	DoubleVector& hXBetaSave; // Delegate

From da4b7e706073f0aa5bec074de50908d3989041b3 Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Fri, 28 Oct 2022 23:12:58 +0000
Subject: [PATCH 02/18] add example for ScanReduce()

---
 .../fused_kernel/device_scan_reduce.cuh | 108 +++++++++++++++++-
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
index 41d6a39c..6764ea95 100644
--- a/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
+++ b/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
@@ -51,7 +51,109 @@ struct DeviceFuse
 //@{

 /**
- * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+ * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified
+ * binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+ *
+ * @par Snippet
+ * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int`
+ * device vector.
+ *
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * #include "device_scan_reduce.cuh"
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items;     // e.g., 7
+ * int *d_in;         // e.g., [1, 2, 2, 1, 3, 0, 1]
+ * int *d_trans_in;   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_sum;        // e.g., [-]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for the
+ * // fused scan-reduce
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), cub::Sum(), min_op,
+ *   num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run the fused scan-reduce
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), cub::Sum(), min_op,
+ *   num_items);
+ *
+ * // d_sum <-- [26]
+ * @endcode
+ *
+ * @tparam ScanInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading scan
+ *   inputs \iterator
+ *
+ * @tparam TransformInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading additional
+ *   input for transformation \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Random-access output iterator type for writing reduction
+ *   outputs \iterator
+ *
+ * @tparam ScanOpT
+ *   **[inferred]** Binary scan functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam ReductionOpT
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam TransformOpT
+ *   **[inferred]** Binary transform functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Random-access iterator to the input sequence of data items
+ *
+ * @param[in] d_trans_in
+ *   Random-access iterator to the additional input sequence of data items
+ *
+ * @param[out] d_sum
+ *   Random-access iterator to the output aggregate
+ *
+ * @param[in] scan_op
+ *   Binary scan functor
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] transform_op
+ *   Binary transform functor applied to each scanned item and the
+ *   corresponding element of `d_trans_in` before reduction
+ *
+ * @param[in] num_items
+ *   Total number of input items (i.e., the length of `d_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
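+ *
+ * @par
+ * For reference, the following sequential sketch is an editorial addition, inferred from the
+ * agent implementation rather than taken from the original documentation; the helper name
+ * `ReferenceScanReduce` is illustrative only. It spells out the quantity the fused kernel computes:
+ * @code
+ * // Illustrative reference (not part of this header): reduce
+ * // transform_op(inclusive_scan(d_in)[i], d_trans_in[i]) over all i.
+ * template <typename T, typename ScanOpT, typename ReductionOpT, typename TransformOpT>
+ * T ReferenceScanReduce(const T *in, const T *trans_in, int num_items,
+ *                       ScanOpT scan_op, ReductionOpT reduction_op, TransformOpT transform_op)
+ * {
+ *     T running   = in[0];                               // inclusive prefix scan accumulator
+ *     T aggregate = transform_op(running, trans_in[0]);  // transform the first scanned item
+ *     for (int i = 1; i < num_items; ++i)
+ *     {
+ *         running   = scan_op(running, in[i]);           // scan[i] = scan_op(scan[i-1], in[i])
+ *         aggregate = reduction_op(aggregate, transform_op(running, trans_in[i]));
+ *     }
+ *     return aggregate;                                  // yields 26 for the example values above
+ * }
+ * @endcode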
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename ScanInputIteratorT, @@ -95,10 +197,6 @@ struct DeviceFuse }; -/** - * \example example_device_fuse.cu - */ - CUB_NAMESPACE_END From 189ffa36a07b2d775e7b127ea2a6442393c78fed Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Sat, 5 Nov 2022 06:09:00 +0000 Subject: [PATCH 03/18] update fused kernel for cub_1.8.0 --- src/Makevars | 1 + src/cyclops/engine/CudaKernel.cu | 3 +- .../fused_kernel_1.8.0/agent_scan_reduce.cuh | 454 ++++++++ .../block_store_exchange.cuh | 1019 +++++++++++++++++ .../fused_kernel_1.8.0/device_scan_reduce.cuh | 206 ++++ .../dispatch_scan_reduce.cuh | 810 +++++++++++++ 6 files changed, 2491 insertions(+), 2 deletions(-) create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh diff --git a/src/Makevars b/src/Makevars index cd1a5116..5f91f59a 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,6 +1,7 @@ CUDA_HOME = /usr/local/cuda CUB_PATH = $(CUDA_HOME)/include +#CUB_PATH = /home/jianxiao/cub_1.8.0/cub ## Use the R_HOME indirection to support installations of multiple R version PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` -lOpenCL -pthread -L$(CUDA_HOME)/lib64 -lcuda -lcudart diff --git a/src/cyclops/engine/CudaKernel.cu b/src/cyclops/engine/CudaKernel.cu index 29c36dd6..f072d7e6 100644 --- a/src/cyclops/engine/CudaKernel.cu +++ b/src/cyclops/engine/CudaKernel.cu @@ -11,11 +11,10 @@ #include #include #include -//#include -//#include #include "CudaKernel.h" #include "fused_kernel/device_scan_reduce.cuh" +//#include "fused_kernel_1.8.0/device_scan_reduce.cuh" using namespace cub; diff --git a/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh new file mode 100644 index 00000000..5bea9c1b --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh @@ -0,0 +1,454 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScanReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan and transform-reduction.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include "block_store_exchange.cuh"
+#include
+#include
+#include
+#include
+#include
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScanReduce
+ */
+template <
+    int _BLOCK_THREADS,                    ///< Threads per thread block
+    int _ITEMS_PER_THREAD,                 ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm _LOAD_ALGORITHM,    ///< The BlockLoad algorithm to use
+    CacheLoadModifier _LOAD_MODIFIER,      ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm _STORE_ALGORITHM,  ///< The BlockStore algorithm to use
+    BlockScanAlgorithm _SCAN_ALGORITHM>    ///< The BlockScan algorithm to use
+    //BlockReduceAlgorithm _REDUCE_ALGORITHM>  ///< Cooperative block-wide reduction algorithm to use
+struct AgentScanPolicy1
+{
+    enum
+    {
+        BLOCK_THREADS    = _BLOCK_THREADS,     ///< Threads per thread block
+        ITEMS_PER_THREAD = _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
+//        VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;     ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;        ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;  ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;     ///< The BlockScan algorithm to use
+    //static const BlockReduceAlgorithm REDUCE_ALGORITHM = _REDUCE_ALGORITHM;  ///< Cooperative block-wide reduction algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScanReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan and transform-reduction.
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename ScanInputIteratorT, ///< Random-access input iterator type + typename TransformInputIteratorT, ///< Random-access additional input iterator type + typename OutputIteratorT, ///< Random-access reduction output iterator type + typename ScanOpT, ///< Scan functor type + typename ReductionOpT, ///< Reduction functor type + typename TransformOpT, ///< Transformation functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScanReduce +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + typedef typename std::iterator_traits::value_type InputT1; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + ScanInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedScanInputIteratorT; + + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + ScanInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedTransformInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + typedef BlockLoad< + InputT1, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT1; + + // Parameterized BlockStore type + typedef BlockStore< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + typedef BlockStore< + InputT1, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT1; + + // Parameterized BlockReduce type + typedef BlockReduce< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // Parameterized BlockScan type + typedef BlockScan< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + InputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + 
InputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + typename BlockLoadT1::TempStorage load1; // Smem needed for tile loading + typename BlockStoreT1::TempStorage store1; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename BlockReduceT::TempStorage reduce; + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Linear thread-id + int linear_tid; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedScanInputIteratorT d_in; ///< Input data + WrappedTransformInputIteratorT d_trans_in; ///< Additional input data for transformation + OutputIteratorT d_block_sum; ///< Output data (block reduction) + OutputIteratorT d_sum; ///< Output data (reduction) + ScanOpT scan_op; ///< Binary scan operator + ReductionOpT reduction_op; ///< Binary reduction operator + TransformOpT transform_op; ///< Transformation operator on scan output + InitValueT init_value; ///< The init_value element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + InputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + InputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + InputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScanReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + ScanInputIteratorT d_in, ///< Input data + TransformInputIteratorT d_trans_in, ///< Additional input data 
for transformation + OutputIteratorT d_block_sum, ///< Output data (block reduction) + OutputIteratorT d_sum, ///< Output data (reduction) + ScanOpT scan_op, ///< Binary scan operator + ReductionOpT reduction_op, ///< Binary reduction operator + TransformOpT transform_op, ///< Transformation operator on scan output + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_trans_in(d_trans_in), + d_block_sum(d_block_sum), + d_sum(d_sum), + scan_op(scan_op), + reduction_op(reduction_op), + transform_op(transform_op), + init_value(init_value), + linear_tid(RowMajorTid(AgentScanPolicyT::BLOCK_THREADS, 1, 1)) + {} + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + InputT items[ITEMS_PER_THREAD]; // input for scan + InputT1 items1[ITEMS_PER_THREAD]; // input for reduction + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + InputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Perform reduce and store + OutputT item; + OutputT block_aggregate = OutputT(); + OutputT thread_aggregate = OutputT(); + + if (IS_LAST_TILE) { // partially-full tile + + BlockStoreT(temp_storage.store).Exchange(tile_offset, items, num_remaining); + + // load additional data + BlockLoadT1(temp_storage.load1).Load(d_trans_in + tile_offset, items1, num_remaining); + BlockStoreT1(temp_storage.store1).Exchange(tile_offset, items1, num_remaining); + + CTA_SYNC(); + + // thread transform reduction + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < num_remaining) + { + item = transform_op(items[ITEM], items1[ITEM]); + thread_aggregate = reduction_op(thread_aggregate, item); + } + } + + } else { // full tile + + // load additional data + BlockLoadT1(temp_storage.load1).Load(d_trans_in + tile_offset, items1); + + CTA_SYNC(); + + // thread transform reduction + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + item = transform_op(items[ITEM], items1[ITEM]); + thread_aggregate = reduction_op(thread_aggregate, item); + } + + } + + // block reduction + block_aggregate = BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + + if (threadIdx.x == 0) + d_block_sum[blockIdx.x] = block_aggregate; + + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< 
Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh b/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh new file mode 100644 index 00000000..59e6f5f9 --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh @@ -0,0 +1,1019 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. 
+ * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
+ MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + + /// Exchange items for partially-full tile (last tile), guarded by range + template + __device__ __forceinline__ void Exchange( + 
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to exchange + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for 
storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. 
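+     * (Note that, per the CUB_STATIC_ASSERT in the \p BLOCK_STORE_WARP_TRANSPOSE helper
+     * above, this specialization requires \p BLOCK_THREADS to be a multiple of the
+     * architecture's warp size.)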
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+     * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5.
+     * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with
+     * only the first two threads being unmasked to store portions of valid data.
+ * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } + + template + __device__ __forceinline__ void Exchange( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to exchange + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Exchange(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh new file mode 100644 index 00000000..4fae1c10 --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh @@ -0,0 +1,206 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScanReduce provides device-wide, parallel operations for computing a prefix scan and a tranform-reduction across a sequence of data items residing within device-accessible memory. 
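+ *
+ * (Sketch of the approach, per the dispatch logic in dispatch_scan_reduce.cuh: the
+ * prefix scan is computed with CUB's single-pass decoupled look-back strategy, each
+ * scanned element is immediately combined with the corresponding element of a second
+ * input sequence via the transform functor, and the transformed values are reduced
+ * to a single aggregate, all in one sweep over the data plus a small fix-up launch.)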
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include "dispatch_scan_reduce.cuh"
+#include 
+
+/// Optional outer namespace(s)
+//CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+struct DeviceFuse
+{
+    /******************************************************************//**
+     * \name Inclusive scan and transform-reduction
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified
+     * binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+     *
+     * @par Snippet
+     * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int`
+     * device vector.
+     *
+     * @par
+     * @code
+     * #include <cub/cub.cuh>
+     * #include "device_scan_reduce.cuh"
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for
+     * // input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [1, 2, 2, 1, 3, 0, 1]
+     * int  *d_trans_in;    // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_sum;         // e.g., [-]
+     * CustomMin min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for the fused
+     * // scan and transform-reduction
+     * void *d_temp_storage = nullptr;
+     * size_t temp_storage_bytes = 0;
+     * DeviceFuse::ScanReduce(
+     *   d_temp_storage, temp_storage_bytes,
+     *   d_in, d_trans_in, d_sum,
+     *   cub::Sum(), min_op, cub::Sum(),
+     *   num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run the fused scan and transform-reduction
+     * DeviceFuse::ScanReduce(
+     *   d_temp_storage, temp_storage_bytes,
+     *   d_in, d_trans_in, d_sum,
+     *   cub::Sum(), min_op, cub::Sum(),
+     *   num_items);
+     *
+     * // d_sum <-- [26]
+     * @endcode
+     *
+     * @tparam ScanInputIteratorT
+     *   **[inferred]** Random-access input iterator type for reading scan
+     *   inputs \iterator
+     *
+     * @tparam TransformInputIteratorT
+     *   **[inferred]** Random-access input iterator type for reading additional
+     *   input for transformation \iterator
+     *
+     * @tparam OutputIteratorT
+     *   **[inferred]** Random-access output iterator type for writing reduction
+     *   outputs \iterator
+     *
+     * @tparam ScanOpT
+     *   **[inferred]** Binary scan functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @tparam ReductionOpT
+     *   **[inferred]** Binary reduction functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @tparam TransformOpT
+     *   **[inferred]** Binary transform functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @param[in] d_temp_storage
+     *   Device-accessible allocation of temporary storage. When `nullptr`, the
+     *   required allocation size is written to `temp_storage_bytes` and no
+     *   work is done.
+     *
+     * @param[in,out] temp_storage_bytes
+     *   Reference to size in bytes of `d_temp_storage` allocation
+     *
+     * @param[in] d_in
+     *   Random-access iterator to the input sequence of data items
+     *
+     * @param[in] d_trans_in
+     *   Random-access iterator to the additional input sequence of data items
+     *
+     * @param[out] d_sum
+     *   Random-access iterator to the output aggregate
+     *
+     * @param[in] num_items
+     *   Total number of input items (i.e., the length of `d_in`)
+     *
+     * @param[in] stream
+     *   **[optional]** CUDA stream to launch kernels within.
+     *   Default is stream0.
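+     *
+     * @param[in] debug_synchronous
+     *   **[optional]** Whether or not to synchronize the stream after every
+     *   kernel launch to check for errors. May cause significant slowdown.
+     *   Default is `false`.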
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template < + typename ScanInputIteratorT, + typename TransformInputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename ReductionOpT, + typename TransformOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t ScanReduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScanReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_trans_in, + d_sum, + scan_op, + reduction_op, + transform_op, + NullType(), + num_items, + stream, + debug_synchronous); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh new file mode 100644 index 00000000..9805552a --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh @@ -0,0 +1,810 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScanReduce provides device-wide, parallel operations for computing a prefix scan and transform-reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "agent_scan_reduce.cuh" +#include +#include +#include +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanReduceInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel1( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename ScanInputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename TransformInputIteratorT, ///< Random-access input iterator type for reading additional inputs for transfromation \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing reduction outputs \iterator + typename ScanTileStateT, ///< Tile status interface type + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename TransformOpT, ///< Transformation functor on scan output + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanReduceKernel( + ScanInputIteratorT d_in, ///< Input data + TransformInputIteratorT d_trans_in, ///< Additional input data for 
transformation + OutputIteratorT d_block_sum, ///< Output data (block reduction) + OutputIteratorT d_sum, ///< Output data (reduction) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + ReductionOpT reduction_op, ///< Binary reduction functor + TransformOpT transform_op, ///< Transformation functor on scan output + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScanReduce< + ScanPolicyT, + ScanInputIteratorT, + TransformInputIteratorT, + OutputIteratorT, + ScanOpT, + ReductionOpT, + TransformOpT, + InitValueT, + OffsetT> AgentScanReduceT; + + // Shared memory for AgentScan + __shared__ typename AgentScanReduceT::TempStorage temp_storage; + + // Process tiles + AgentScanReduceT(temp_storage, d_in, d_trans_in, d_block_sum, d_sum, scan_op, reduction_op, transform_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); + + +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. + */ +template < +// typename ChainedPolicyT, ///< Chained tuning policy + typename SingleTilePolicyT, ///< Chained tuning policy + typename ScanInputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +//__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__launch_bounds__ (int(SingleTilePolicyT::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel1( + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< +// typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + SingleTilePolicyT, + ScanInputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct 
DeviceScanReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename ScanInputIteratorT, ///< 
Random-access input iterator type for reading scan inputs \iterator + typename TransformInputIteratorT, ///< Random-access input iterator type for reading additional inputs for transformation \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing reduction outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename TransformOpT, ///< Transformation functor on scan output + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScanReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + int num_tiles; + int init_grid_size; + int scan_grid_size; + int scan_sm_occupancy; + ScanTileStateT tile_state; + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + 
LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + struct PtxSingleTilePolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. 
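+     *
+     * (A sketch of the flow implemented below: \p init_kernel first initializes the
+     * \p ScanTileStateT tile descriptors; \p scan_reduce_kernel then sweeps the input
+     * in grid-sized epochs, writing one partial reduction per thread block to
+     * \p d_block_sum; a final single launch of \p single_tile_kernel folds those
+     * partials into the scalar \p d_sum. Callers follow the usual CUB two-phase
+     * pattern, sketched here with elided arguments:)
+     *
+     * \code
+     * size_t temp_storage_bytes = 0;
+     * Dispatch(NULL, temp_storage_bytes, ...);            // sizing pass: only computes the byte count
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     * Dispatch(d_temp_storage, temp_storage_bytes, ...);  // launch pass: runs the kernels
+     * \endcode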
+ */ + template < +// typename SingleTilePolicyT, + typename ScanReduceInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanReduceSweepKernelPtrT, ///< Function type of cub::DeviceScanKernelPtrT + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanReduceInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanReduceSweepKernelPtrT scan_reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + SingleTileKernelT single_tile_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_trans_in; + (void)d_sum; + (void)scan_op; + (void)reduction_op; + (void)transform_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_reduce_kernel; + (void)single_tile_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + typedef typename DispatchScanReduce::MaxPolicy MaxPolicyT; + typedef typename MaxPolicyT::SingleTilePolicy SingleTilePolicyT; + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + + // Specify temporary storage 
allocation requirements + size_t allocation_sizes[2]; // bytes for both scan and reduction + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors (scan) + allocation_sizes[1] = + { + scan_grid_size * sizeof(OutputT) // bytes needed for privatized block reductions (reduction) + }; + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[2]; // temp storage for both scan and reduction + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + OutputT* d_block_sum = (OutputT*) allocations[1]; + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_reduce_kernel, + scan_kernel_config.block_threads))) break; + + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_reduce_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); + + // Invoke scan_kernel + scan_reduce_kernel<<>>( + d_in, + d_trans_in, + d_block_sum, + d_sum, + tile_state, + start_tile, + scan_op, + reduction_op, + transform_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + SingleTilePolicyT::BLOCK_THREADS, + (long long) stream, + SingleTilePolicyT::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, SingleTilePolicyT::BLOCK_THREADS, 0, stream>>>( + d_block_sum, + d_sum, + scan_grid_size, + reduction_op, + OutputT()); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* 
d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchScanReduce::MaxPolicy MaxPolicyT; + typedef typename MaxPolicyT::SingleTilePolicy SingleTilePolicyT; + + cudaError error = cudaSuccess; +// debug_synchronous = true; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_trans_in, + d_sum, + scan_op, + reduction_op, + transform_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanReduceInitKernel, + DeviceScanReduceKernel, + DeviceReduceSingleTileKernel1, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + From 1cf448118cf64d100bb08ba14fedcdfe229cad8e Mon Sep 17 00:00:00 2001 From: Jianxiao Yang Date: Sat, 10 Dec 2022 19:55:06 +0800 Subject: [PATCH 04/18] add ScanReduce() example for cub_1.17 --- .../fused_kernel_1.17/device_scan_reduce.cuh | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh index 41d6a39c..4b8c1e9c 100644 --- a/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh +++ b/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh @@ -52,6 +52,107 @@ struct DeviceFuse /** * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified binary \p scan_op functor, \p reduction_op functor and \p transform_op functor. + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int` + * device vector. + * + * @par + * @code + * #include + * #include "device_scan_reduce.cuh" + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? 
b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int  num_items;      // e.g., 7
+ * int  *d_in;          // e.g., [1, 2, 2, 1, 3, 0, 1]
+ * int  *d_trans_in;    // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int  *d_sum;         // e.g., [-]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for the fused
+ * // scan and transform-reduction
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), min_op, cub::Sum(),
+ *   num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run the fused scan and transform-reduction
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), min_op, cub::Sum(),
+ *   num_items);
+ *
+ * // d_sum <-- [26]
+ * @endcode
+ *
+ * @tparam ScanInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading scan
+ *   inputs \iterator
+ *
+ * @tparam TransformInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading additional
+ *   input for transformation \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Random-access output iterator type for writing reduction
+ *   outputs \iterator
+ *
+ * @tparam ScanOpT
+ *   **[inferred]** Binary scan functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam ReductionOpT
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam TransformOpT
+ *   **[inferred]** Binary transform functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Random-access iterator to the input sequence of data items
+ *
+ * @param[in] d_trans_in
+ *   Random-access iterator to the additional input sequence of data items
+ *
+ * @param[out] d_sum
+ *   Random-access iterator to the output aggregate
+ *
+ * @param[in] num_items
+ *   Total number of input items (i.e., the length of `d_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename ScanInputIteratorT, From 270043434fa217f834a0d03bb037aa224011f045 Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Tue, 4 Apr 2023 14:59:40 -0700 Subject: [PATCH 05/18] attempt at 3.3.0 CRAN submission --- DESCRIPTION | 2 +- NEWS.md | 11 +++++++++++ R/Survfit.R | 6 ++++-- README.md | 4 ++-- cran-comments.md | 18 ++++++++++-------- inst/CITATION | 10 +++++----- man/survfit.cyclopsFit.Rd | 6 ++++-- src/cyclops/Types.h | 18 +++++++++--------- ...ierarchyGridSearchCrossValidationDriver.cpp | 6 +++--- 9 files changed, 49 insertions(+), 32 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f9555bf2..4b37e0fc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: Cyclops Type: Package Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis -Version: 3.2.0 +Version: 3.3.0 Authors@R: c( person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")), person("Martijn J.", "Schuemie", role = "aut"), diff --git a/NEWS.md b/NEWS.md index ee655c35..17c42c9b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,15 @@ +Cyclops v3.3.0 +============== + +Changes: + +1. bump for R 4.2 +2. fix CRAN warnings + a. used `minValues` + b. explicit dependence on C++11 + Cyclops v3.2.0 +============== Changes: diff --git a/R/Survfit.R b/R/Survfit.R index f1f2532f..798935fb 100644 --- a/R/Survfit.R +++ b/R/Survfit.R @@ -4,15 +4,17 @@ #' @description #' \code{survfit.cyclopsFit} computes baseline hazard function #' -#' @param cyclopsFit A Cyclops survival model fit object +#' @param formula A Cyclops survival model fit object #' @param type type of baseline survival, choices are: "aalen" (Breslow) +#' @param ... for future methods #' #' @return Baseline survival function for mean covariates #' #' @importFrom survival survfit #' #' @export -survfit.cyclopsFit <- function(cyclopsFit, type="aalen") { +survfit.cyclopsFit <- function(formula, type="aalen", ...) { + cyclopsFit <- formula delta = meanLinearPredictor(cyclopsFit) times = getTimeVector(cyclopsFit$cyclopsData) diff --git a/README.md b/README.md index 3cf4cb0c..f7eded0b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Cyclops ======= [![Build Status](https://github.com/ohdsi/Cyclops/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/Cyclops/actions?query=workflow%3AR-CMD-check) -[![codecov.io](https://codecov.io/github/OHDSI/Cyclops/coverage.svg?branch=main)](https://codecov.io/github/OHDSI/Cyclops?branch=main) +[![codecov.io](https://app.codecov.io/github/OHDSI/Cyclops/coverage.svg?branch=main)](https://app.codecov.io/github/OHDSI/Cyclops?branch=main) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/Cyclops)](https://CRAN.R-project.org/package=Cyclops) [![CRAN_Status_Badge](https://cranlogs.r-pkg.org/badges/Cyclops)](https://cran.r-project.org/package=Cyclops) @@ -76,7 +76,7 @@ License ======= Cyclops is licensed under Apache License 2.0. Cyclops contains the TinyThread libray. -The TinyThread library is licensed under the [zlib/libpng](https://opensource.org/licenses/Zlib/) license as described [here](https://tinythreadpp.bitsnbites.eu/). +The TinyThread library is licensed under the [zlib/libpng](https://opensource.org/license/zlib-license-php/) license as described [here](https://tinythreadpp.bitsnbites.eu/). 
Development
===========
diff --git a/cran-comments.md b/cran-comments.md
index 953a3c98..d76f48cb 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,9 +1,6 @@
-## Initial submission of major patch update to package
+## Initial submission of patch update to package
 
-* fixed likelihood profiling when objective function is non-concave due
-  to floating-point truncation
-* fixed parsing of 64-bit integer covariate IDs
-* added Jeffrey's prior for single regression coefficient
+* fixed all CRAN WARNINGS, including generic-inconsistency, uninitialized values
 
 ## Test environments
 * local OS X install, R 4.1
@@ -12,16 +9,21 @@
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 1 occasional NOTE:
-  checking installed package size ... NOTE
+* There are 2 occasional NOTEs:
+  1. checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
       libs  21.7Mb
+  2. Specified C++11: please drop specification unless essential
 
-This occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
+The first NOTE occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
 symbols in their compilation; Cyclops performance is heavily dependent on many template
 instantiations that generate a large library when including debug symbols. Future
 availability of C++17 'if constexpr (...)' should decrease library size substantially.
 
+The second NOTE remains because many OHDSI community users remain on R 4.0, which supports
+C++11 but does not automatically compile with C++11 enabled.
+
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
+* 'IterativeHardThresholding' - checked and works.
diff --git a/inst/CITATION b/inst/CITATION
index 8d29f2ff..7306e6e7 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -1,11 +1,11 @@
 citHeader("To cite Cyclops in publications use:")
 
 bibentry(bibtype = "Article",
-         author = personList(as.person("M. A. Suchard"),
-                             as.person("S. E. Simpson"),
-                             as.person("I. Zorych"),
-                             as.person("P. Ryan"),
-                             as.person("D. Madigan")),
+         author = c(as.person("M. A. Suchard"),
+                    as.person("S. E. Simpson"),
+                    as.person("I. Zorych"),
+                    as.person("P. Ryan"),
+                    as.person("D. Madigan")),
          title = "Massive parallelization of serial inference algorithms for complex generalized linear models",
          journal = "ACM Transactions on Modeling and Computer Simulation",
          volume = "23",
diff --git a/man/survfit.cyclopsFit.Rd b/man/survfit.cyclopsFit.Rd
index e77faa42..0b4b4051 100644
--- a/man/survfit.cyclopsFit.Rd
+++ b/man/survfit.cyclopsFit.Rd
@@ -4,12 +4,14 @@
 \alias{survfit.cyclopsFit}
 \title{Calculate baseline hazard function}
 \usage{
-\method{survfit}{cyclopsFit}(cyclopsFit, type = "aalen")
+\method{survfit}{cyclopsFit}(formula, type = "aalen", ...)
} \arguments{ -\item{cyclopsFit}{A Cyclops survival model fit object} +\item{formula}{A Cyclops survival model fit object} \item{type}{type of baseline survival, choices are: "aalen" (Breslow)} + +\item{...}{for future methods} } \value{ Baseline survival function for mean covariates diff --git a/src/cyclops/Types.h b/src/cyclops/Types.h index 32d514ac..b477e85a 100644 --- a/src/cyclops/Types.h +++ b/src/cyclops/Types.h @@ -12,7 +12,7 @@ #include #include -#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +// #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L // C++11 #include namespace bsccs { @@ -20,15 +20,15 @@ using std::make_shared; using std::unique_ptr; } -#else +// #else // C++98 (R build) - #include "boost/smart_ptr.hpp" - namespace bsccs { - using boost::shared_ptr; - using boost::make_shared; - using boost::unique_ptr; - } -#endif +// #include "boost/smart_ptr.hpp" +// namespace bsccs { +// using boost::shared_ptr; +// using boost::make_shared; +// using boost::unique_ptr; +// } +// #endif #ifdef WIN_BUILD #include diff --git a/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp b/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp index 49c3476d..68c3d1a0 100644 --- a/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp +++ b/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp @@ -74,7 +74,7 @@ void HierarchyGridSearchCrossValidationDriver::drive(CyclicCoordinateDescent& cc std::vector outerPoints; std::vector innerPoints; std::vector outerValues; - std::vector minValues; + // std::vector minValues; for (int outerStep = 0; outerStep < gridSize; outerStep++){ std::vector predLogLikelihoodOuter; @@ -124,9 +124,9 @@ void HierarchyGridSearchCrossValidationDriver::drive(CyclicCoordinateDescent& cc // Report results double maxPoint; double maxValue; - double minValue; + // double minValue; removed for v3.3.0; TODO why was this here? 
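+        // (Note: findMax() only reports the location and value of the maximum, so
+        // minValue was never assigned before the removed push_back() below stored it;
+        // that uninitialized read is the likely trigger for the CRAN warning cited
+        // in cran-comments.md.)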
findMax(&maxPoint, &maxValue); - minValues.push_back(minValue); + // minValues.push_back(minValue); innerPoints.push_back(maxPoint); outerPoints.push_back(outerPoint); From d88c531efa06933c1007cb785343f1382c95ee0d Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Wed, 5 Apr 2023 08:47:41 -0700 Subject: [PATCH 06/18] fix CRAN warnings and notes; remove explicit dependency on C++11 for R < 4.1 --- .Rbuildignore | 2 ++ .gitignore | 2 ++ DESCRIPTION | 1 + NEWS.md | 3 ++- cleanup | 3 +++ configure | 18 ++++++++++++++++++ configure.win | 2 ++ cran-comments.md | 13 +++++-------- src/{Makevars => Makevars.in} | 2 -- src/{Makevars.win => Makevars.win.in} | 2 -- tools/configure.R | 23 +++++++++++++++++++++++ 11 files changed, 58 insertions(+), 13 deletions(-) create mode 100755 cleanup create mode 100755 configure create mode 100755 configure.win rename src/{Makevars => Makevars.in} (99%) rename src/{Makevars.win => Makevars.win.in} (99%) create mode 100644 tools/configure.R diff --git a/.Rbuildignore b/.Rbuildignore index 5695b3ca..6d87dae1 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -34,3 +34,5 @@ deprecated ^\.github$ ^CRAN-RELEASE$ .covrignore +src/Makevars$ +src/Makevars.win$ diff --git a/.gitignore b/.gitignore index 66ec7364..acfe09e8 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ CRAN-RELEASE .DS_Store *.gcno *.gcda +src/Makevars +src/Makevars.win diff --git a/DESCRIPTION b/DESCRIPTION index 4b37e0fc..3f493895 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Description: This model fitting tool incorporates cyclic coordinate descent and Suchard, Simpson, Zorych, Ryan and Madigan (2013) . License: Apache License 2.0 LazyData: Yes +Biarch: true URL: https://github.com/ohdsi/cyclops BugReports: https://github.com/ohdsi/cyclops/issues Depends: diff --git a/NEWS.md b/NEWS.md index 17c42c9b..3cf71839 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,8 @@ Changes: 1. bump for R 4.2 2. fix CRAN warnings a. used `minValues` - b. explicit dependence on C++11 +3. fix CRAN notes + a. remove explicit dependence on C++11 (except for R <= 4.0) Cyclops v3.2.0 ============== diff --git a/cleanup b/cleanup new file mode 100755 index 00000000..db13273f --- /dev/null +++ b/cleanup @@ -0,0 +1,3 @@ +#!/bin/sh + +rm -f config.* src/Makevars src/Markvars.win src/config.h diff --git a/configure b/configure new file mode 100755 index 00000000..87fb3528 --- /dev/null +++ b/configure @@ -0,0 +1,18 @@ +#!/usr/bin/env sh + +# adapted from https://github.com/tidyverse/purrr/blob/main/configure + +# Check that this is not just ./configure. We need to run this +# from R CMD INSTALL, to have the R env vars set. + +if [ -z "$R_HOME" ]; then +echo >&2 R_HOME is not set, are you running R CMD INSTALL? +exit 1 +fi + +# Find the R binary we need to use. This is a bit trickier on +# Windows, because it has two architectures. On windows R_ARCH_BIN +# is set, so this should work everywhere. +RBIN="${R_HOME}/bin${R_ARCH_BIN}/R" + +"$RBIN" --vanilla --slave -f tools/configure.R diff --git a/configure.win b/configure.win new file mode 100755 index 00000000..1f66e1c7 --- /dev/null +++ b/configure.win @@ -0,0 +1,2 @@ +#! 
+sh ./configure
diff --git a/cran-comments.md b/cran-comments.md
index d76f48cb..c4255b42 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,6 +1,7 @@
 ## Initial submission of patch update to package
 
-* fixed all CRAN WARNINGS, including generic-inconsistency, uninitialized values
+* fixed all WARNINGS, including generic-inconsistency, uninitialized values
+* fixed NOTE about C++11
 
 ## Test environments
 * local OS X install, R 4.1
@@ -9,21 +10,17 @@
 
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 2 occasional NOTEs:
-  1. checking installed package size ... NOTE
+* There is 1 occasional NOTE:
     checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
       libs  21.7Mb
-  2. Specified C++11: please drop specification unless essential
 
-The first NOTE occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
+This occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
 symbols in their compilation; Cyclops performance is heavily dependent on many template
 instantiations that generate a large library when including debug symbols.  Future
 availability of C++17 'if constexpr (...)' should decrease library size substantially.
 
-The second NOTE remains because many OHDSI community users remain on R 4.0 that supports C++11
-but does not automatically compile with C++11 enabled.
-
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
 * 'IterativeHardThresholding' - checked and works.
diff --git a/src/Makevars b/src/Makevars.in
similarity index 99%
rename from src/Makevars
rename to src/Makevars.in
index a1bfd86d..ee8a04ba 100644
--- a/src/Makevars
+++ b/src/Makevars.in
@@ -24,8 +24,6 @@ PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
 ## done by a number of packages, but recommended only for more advanced users
 ## comfortable with autoconf and its related tools.
 
-CXX_STD = CXX11
-
 PKG_CPPFLAGS = -I. -Icyclops -DR_BUILD -DDOUBLE_PRECISION
 
 PKG_CXXFLAGS = -g1
diff --git a/src/Makevars.win b/src/Makevars.win.in
similarity index 99%
rename from src/Makevars.win
rename to src/Makevars.win.in
index dd5cd45c..8a09c711 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win.in
@@ -24,8 +24,6 @@ PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
 ## done by a number of packages, but recommended only for more advanced users
 ## comfortable with autoconf and its related tools.
 
-CXX_STD = CXX11
-
 PKG_CPPFLAGS = -I.
-Icyclops -DR_BUILD -DWIN_BUILD -DDOUBLE_PRECISION -DRCPP_PARALLEL_USE_TBB=1 # PKG_LIBS += $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" \ diff --git a/tools/configure.R b/tools/configure.R new file mode 100644 index 00000000..d9853786 --- /dev/null +++ b/tools/configure.R @@ -0,0 +1,23 @@ +# Gets run when compiling + +makevars_in <- file.path("src", "Makevars.in") +makevars_win_in <- file.path("src", "Makevars.win.in") + +makevars_out <- file.path("src", "Makevars") +makevars_win_out <- file.path("src", "Makevars.win") + +txt <- readLines(makevars_in) +txt_win <- readLines(makevars_win_in) + +if (getRversion() < "4.1") { + if (!any(grepl("^CXX_STD", txt))) { + txt <- c("CXX_STD = CXX11", txt) + } + + if (!any(grepl("^CXX_STD", txt_win))) { + txt_win <- c("CXX_STD = CXX11", txt_win) + } +} + +cat(txt, file = makevars_out, sep = "\n") +cat(txt_win, file = makevars_win_out, sep = "\n") From ec41b492efbdaca83a923ffbe8c897a703435493 Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Wed, 5 Apr 2023 10:57:32 -0700 Subject: [PATCH 07/18] fix line-endings for windoz, remove some std::cerr --- DESCRIPTION | 2 +- NEWS.md | 1 + src/cyclops/drivers/AbstractCrossValidationDriver.cpp | 2 -- src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp | 5 +---- tools/configure.R | 7 +++++-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 747d0b51..0158d5b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: Cyclops Type: Package Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis -Version: 3.3.0 +Version: 3.3.0.99 Authors@R: c( person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")), person("Martijn J.", "Schuemie", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 726fbbc8..37d71a8a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ develop Changes: 1. improvements on adaptive likelihood profiling +2. 
fix line-endings on Makevars on windows
 
 Cyclops v3.3.0
 ==============
diff --git a/src/cyclops/drivers/AbstractCrossValidationDriver.cpp b/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
index 2d3ac6f0..4ceddf75 100644
--- a/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
+++ b/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
@@ -141,8 +141,6 @@ double AbstractCrossValidationDriver::doCrossValidationStep(
 	const auto& arguments = allArguments.crossValidation;
 	bool coldStart = allArguments.resetCoefficients;
 
-	std::cerr << "C " << arguments.foldToCompute << std::endl;
-
 	predLogLikelihood.resize(arguments.foldToCompute);
 
 	auto& weightsExclude = this->weightsExclude;
diff --git a/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp b/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
index b4b67080..9b1da1ee 100644
--- a/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
+++ b/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
@@ -112,13 +112,10 @@ MaxPoint AutoSearchCrossValidationDriver::doCrossValidationLoop(
 
 		std::vector<double> predLogLikelihood;
 
-		std::cerr << "A" << std::endl;
-
 		// Newly re-located code
 		double pointEstimate = doCrossValidationStep(ccd, selector, allArguments, step,
 			nThreads, ccdPool, selectorPool,
-			predLogLikelihood);
-		std::cerr << "B" << std::endl;
+			predLogLikelihood);
 
 		double stdDevEstimate = computeStDev(predLogLikelihood, pointEstimate);
 
diff --git a/tools/configure.R b/tools/configure.R
index d9853786..b372da41 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -19,5 +19,8 @@ if (getRversion() < "4.1") {
   }
 }
 
-cat(txt, file = makevars_out, sep = "\n")
-cat(txt_win, file = makevars_win_out, sep = "\n")
+if (.Platform$OS.type == "unix") {
+  cat(txt, file = makevars_out, sep = "\n")
+} else {
+  cat(txt_win, file = makevars_win_out, sep = "\n")
+}

From 353d6bda4800b79091f42aebb43636063454acca Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:35:13 +0000
Subject: [PATCH 08/18] log both raw estimates and summary stats for bootstrap

---
 src/cyclops/drivers/BootstrapDriver.cpp | 54 +++++++++++++++++++++++++
 src/cyclops/drivers/BootstrapDriver.h   |  2 +
 2 files changed, 56 insertions(+)

diff --git a/src/cyclops/drivers/BootstrapDriver.cpp b/src/cyclops/drivers/BootstrapDriver.cpp
index e07613f4..049cefaa 100644
--- a/src/cyclops/drivers/BootstrapDriver.cpp
+++ b/src/cyclops/drivers/BootstrapDriver.cpp
@@ -131,4 +131,58 @@ void BootstrapDriver::logResults(const CCDArguments& arguments, std::vector
 }
 
+void BootstrapDriver::logHR(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string treatmentId) {
+
+// int j = J-1;
+	int j = 0;
+	while (modelData->getColumnLabel(j) != treatmentId) j++;
+
+	ofstream outLog(arguments.outFileName.c_str());
+	if (!outLog) {
+		std::ostringstream stream;
+		stream << "Unable to open log file: " << arguments.outFileName;
+		error->throwError(stream);
+	}
+
+	string sep(","); // TODO Make option
+
+	// Raw estimates
+	for(rvector::iterator it = estimates[j]->begin(); it != estimates[j]->end(); ++it) outLog << *it << endl;
+
+	// Stats
+	outLog << "Drug_concept_id" << sep << "Condition_concept_id" << sep <<
+			"score" << sep << "standard_error" << sep << "bs_mean" << sep << "bs_lower" << sep <<
+			"bs_upper" << sep << "bs_prob0" << endl;
+
+	outLog << modelData->getColumnLabel(j) <<
+			sep << treatmentId << sep;
+
+	double mean = 0.0;
+	double var = 0.0;
+	double prob0 = 0.0;
+	for (rvector::iterator it = estimates[j]->begin(); it != estimates[j]->end(); ++it) {
+		mean += *it;
+		var += *it * *it;
+		if (*it == 0.0) {
+			prob0 += 1.0;
+		}
+	}
+
+	double size = static_cast<double>(estimates[j]->size());
+	mean /= size;
+	var = (var / size) - (mean * mean);
+	prob0 /= size;
+
+	sort(estimates[j]->begin(), estimates[j]->end());
+	int offsetLower = static_cast<int>(size * 0.025);
+	int offsetUpper = static_cast<int>(size * 0.975);
+
+	double lower = *(estimates[j]->begin() + offsetLower);
+	double upper = *(estimates[j]->begin() + offsetUpper);
+
+	outLog << savedBeta[j] << sep;
+	outLog << std::sqrt(var) << sep << mean << sep << lower << sep << upper << sep << prob0 << endl;
+	outLog.close();
+}
+
 } // namespace
diff --git a/src/cyclops/drivers/BootstrapDriver.h b/src/cyclops/drivers/BootstrapDriver.h
index e72571ee..64fba9e7 100644
--- a/src/cyclops/drivers/BootstrapDriver.h
+++ b/src/cyclops/drivers/BootstrapDriver.h
@@ -39,6 +39,8 @@ class BootstrapDriver : public AbstractDriver {
 	void logResults(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string conditionId);
 
+	void logHR(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string treatmentId);
+
 private:
 	const int replicates;
 	AbstractModelData* modelData;
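The file that logHR writes starts with one raw bootstrap estimate per line, followed by a header
and a single summary row. That summary can be recomputed in R from the raw estimates; a minimal
sketch, with `est` standing in for the raw estimates (illustrative values only):

    est <- rnorm(100, mean = 0.3, sd = 0.1)   # stand-in for the raw bootstrap estimates
    n <- length(est)
    bs_mean <- mean(est)
    bs_se <- sqrt(mean(est^2) - bs_mean^2)    # population-style variance, matching the C++ loop
    sorted <- sort(est)
    bs_lower <- sorted[floor(n * 0.025) + 1]  # the 0-based offsetLower becomes 1-based in R
    bs_upper <- sorted[floor(n * 0.975) + 1]
    bs_prob0 <- mean(est == 0)                # bootstrap mass exactly at zero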
From 30e44e4450ee30fae06e4ad405acf15d4eb83916 Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:40:25 +0000
Subject: [PATCH 09/18] use the same permutation function for
 SelectorType::BY_ROW and SelectorType::BY_PID for now

---
 src/cyclops/drivers/BootstrapSelector.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/cyclops/drivers/BootstrapSelector.cpp b/src/cyclops/drivers/BootstrapSelector.cpp
index 734589c8..b696bdd4 100644
--- a/src/cyclops/drivers/BootstrapSelector.cpp
+++ b/src/cyclops/drivers/BootstrapSelector.cpp
@@ -56,18 +56,20 @@ void BootstrapSelector::permute() {
 
 	// Get non-excluded indices
 	int N_new = indicesIncluded.size();
-	if (type == SelectorType::BY_PID) {
+//	if (type == SelectorType::BY_PID) {
 		std::uniform_int_distribution<int> uniform(0, N_new - 1);
 		for (int i = 0; i < N_new; i++) {
 			int ind = uniform(prng);
 			int draw = indicesIncluded[ind];
 			selectedSet.insert(draw);
 		}
+/*
 	} else {
 		std::ostringstream stream;
 		stream << "BootstrapSelector::permute is not yet implemented.";
 		error->throwError(stream);
 	}
+*/
 }
 
 void BootstrapSelector::getWeights(int batch, std::vector<double>& weights) {
@@ -80,16 +82,18 @@ void BootstrapSelector::getWeights(int batch, std::vector<double>& weights) {
 		return;
 	}
 
-	if (type == SelectorType::BY_PID) {
+	//if (type == SelectorType::BY_PID) {
 		for (size_t k = 0; k < K; k++) {
 			int count = selectedSet.count(ids.at(k));
 			weights[k] = static_cast<double>(count);
 		}
+/*
 	} else {
 		std::ostringstream stream;
 		stream << "BootstrapSelector::getWeights is not yet implemented.";
 		error->throwError(stream);
 	}
+*/
 }
 
 void BootstrapSelector::getComplement(std::vector<double>& weights) {

From 8f161fe2b5c17456f9f576b54adad9c445247b4e Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:43:10 +0000
Subject: [PATCH 10/18] R wrapper for bootstrap; fix (?)
bootstrap for cox --- NAMESPACE | 1 + R/ModelFit.R | 7 +++++++ R/RcppExports.R | 4 ++++ src/RcppCyclopsInterface.cpp | 25 +++++++++++++++++++++++++ src/RcppCyclopsInterface.h | 4 ++-- src/RcppExports.cpp | 14 ++++++++++++++ src/cyclops/CcdInterface.cpp | 13 ++++++++++--- src/cyclops/CcdInterface.h | 3 ++- 8 files changed, 65 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3045a7c4..648ebbf1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(createNonSeparablePrior) export(createParameterizedPrior) export(createPrior) export(createWeightBasedSearchControl) +export(doBootstrap) export(finalizeSqlCyclopsData) export(fitCyclopsModel) export(fitCyclopsSimulation) diff --git a/R/ModelFit.R b/R/ModelFit.R index f74e48f2..c183b3c9 100644 --- a/R/ModelFit.R +++ b/R/ModelFit.R @@ -784,6 +784,13 @@ getSEs <- function(object, covariates) { ses } +#' @export +doBootstrap <- function(object, outFileName, treatmentId) { + .checkInterface(object$cyclopsData, testOnly = TRUE) + bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId) + bs +} + #' @title Confidence intervals for Cyclops model parameters #' #' @description diff --git a/R/RcppExports.R b/R/RcppExports.R index 25c2f8ec..3a5bac2a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -93,6 +93,10 @@ .Call(`_Cyclops_cyclopsFitModel`, inRcppCcdInterface) } +.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId) { + .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId) +} + .cyclopsLogModel <- function(inRcppCcdInterface) { .Call(`_Cyclops_cyclopsLogModel`, inRcppCcdInterface) } diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp index c58a3706..3085ecbc 100644 --- a/src/RcppCyclopsInterface.cpp +++ b/src/RcppCyclopsInterface.cpp @@ -514,6 +514,31 @@ List cyclopsFitModel(SEXP inRcppCcdInterface) { return list; } +// [[Rcpp::export(".cyclopsRunBootstrap")]] +List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId) { + using namespace bsccs; + + XPtr interface(inRcppCcdInterface); + interface->getArguments().doBootstrap = true; + interface->getArguments().outFileName = outFileName; + + // Save parameter point-estimates + std::vector savedBeta; + for (int j = 0; j < interface->getCcd().getBetaSize(); ++j) { + savedBeta.push_back(interface->getCcd().getBeta(j)); + } // TODO Handle above work in interface.runBootstrap + double timeUpdate = interface->runBoostrap(savedBeta, treatmentId); + + interface->diagnoseModel(0.0, 0.0); + + List list = List::create( + Rcpp::Named("interface")=interface, + Rcpp::Named("timeFit")=timeUpdate + ); + RcppCcdInterface::appendRList(list, interface->getResult()); + return list; +} + // [[Rcpp::export(".cyclopsLogModel")]] List cyclopsLogModel(SEXP inRcppCcdInterface) { using namespace bsccs; diff --git a/src/RcppCyclopsInterface.h b/src/RcppCyclopsInterface.h index 8225a2d4..202616f5 100644 --- a/src/RcppCyclopsInterface.h +++ b/src/RcppCyclopsInterface.h @@ -63,8 +63,8 @@ class RcppCcdInterface : public CcdInterface { return CcdInterface::runCrossValidation(ccd, modelData); } - double runBoostrap(std::vector& savedBeta) { - return CcdInterface::runBoostrap(ccd, modelData, savedBeta); + double runBoostrap(std::vector& savedBeta, std::string& treatmentId) { + return CcdInterface::runBoostrap(ccd, modelData, savedBeta, treatmentId); } void logResultsToFile(const std::string& fileName, bool withASE); diff --git 
a/src/RcppExports.cpp b/src/RcppExports.cpp index 84834200..06d6d34f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -305,6 +305,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cyclopsRunBootstrap +List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId); +RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const std::string& >::type outFileName(outFileNameSEXP); + Rcpp::traits::input_parameter< std::string& >::type treatmentId(treatmentIdSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId)); + return rcpp_result_gen; +END_RCPP +} // cyclopsLogModel List cyclopsLogModel(SEXP inRcppCcdInterface); RcppExport SEXP _Cyclops_cyclopsLogModel(SEXP inRcppCcdInterfaceSEXP) { @@ -805,6 +818,7 @@ static const R_CallMethodDef CallEntries[] = { {"_Cyclops_cyclopsSetControl", (DL_FUNC) &_Cyclops_cyclopsSetControl, 23}, {"_Cyclops_cyclopsRunCrossValidationl", (DL_FUNC) &_Cyclops_cyclopsRunCrossValidationl, 1}, {"_Cyclops_cyclopsFitModel", (DL_FUNC) &_Cyclops_cyclopsFitModel, 1}, + {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 3}, {"_Cyclops_cyclopsLogModel", (DL_FUNC) &_Cyclops_cyclopsLogModel, 1}, {"_Cyclops_cyclopsInitializeModel", (DL_FUNC) &_Cyclops_cyclopsInitializeModel, 4}, {"_Cyclops_listOpenCLDevices", (DL_FUNC) &_Cyclops_listOpenCLDevices, 0}, diff --git a/src/cyclops/CcdInterface.cpp b/src/cyclops/CcdInterface.cpp index 29a60ddd..cc1e8c65 100644 --- a/src/cyclops/CcdInterface.cpp +++ b/src/cyclops/CcdInterface.cpp @@ -697,21 +697,28 @@ SelectorType CcdInterface::getDefaultSelectorTypeOrOverride(SelectorType selecto double CcdInterface::runBoostrap( CyclicCoordinateDescent *ccd, AbstractModelData *modelData, - std::vector& savedBeta) { + std::vector& savedBeta, + std::string& treatmentId) { struct timeval time1, time2; gettimeofday(&time1, NULL); auto selectorType = getDefaultSelectorTypeOrOverride( arguments.crossValidation.selectorType, modelData->getModelType()); - BootstrapSelector selector(arguments.replicates, modelData->getPidVectorSTL(), + vector ids; + if (selectorType == SelectorType::BY_ROW) { + ids.resize(modelData->getNumberOfRows()); + std::iota(ids.begin(), ids.end(), 0); + } + BootstrapSelector selector(arguments.replicates, selectorType == SelectorType::BY_ROW ? 
ids : modelData->getPidVectorSTL(), selectorType, arguments.seed, logger, error); BootstrapDriver driver(arguments.replicates, modelData, logger, error); driver.drive(*ccd, selector, arguments); gettimeofday(&time2, NULL); - driver.logResults(arguments, savedBeta, ccd->getConditionId()); +// driver.logResults(arguments, savedBeta, ccd->getConditionId()); + driver.logHR(arguments, savedBeta, treatmentId); return calculateSeconds(time1, time2); } diff --git a/src/cyclops/CcdInterface.h b/src/cyclops/CcdInterface.h index 9728d7b5..318dee1c 100644 --- a/src/cyclops/CcdInterface.h +++ b/src/cyclops/CcdInterface.h @@ -197,7 +197,8 @@ class CcdInterface { double runBoostrap( CyclicCoordinateDescent *ccd, AbstractModelData *modelData, - std::vector& savedBeta); + std::vector& savedBeta, + std::string& treatmentId); void setDefaultArguments(); From 12a748c9872608b9a516896638580fc10c96f80a Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Tue, 11 Apr 2023 22:52:30 +0000 Subject: [PATCH 11/18] update R function name --- NAMESPACE | 2 +- R/ModelFit.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 648ebbf1..1e080feb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(createNonSeparablePrior) export(createParameterizedPrior) export(createPrior) export(createWeightBasedSearchControl) -export(doBootstrap) export(finalizeSqlCyclopsData) export(fitCyclopsModel) export(fitCyclopsSimulation) @@ -42,6 +41,7 @@ export(listOpenCLDevices) export(meanLinearPredictor) export(mse) export(readCyclopsData) +export(runBootstrap) export(setOpenCLDevice) export(simulateCyclopsData) import(Matrix) diff --git a/R/ModelFit.R b/R/ModelFit.R index c183b3c9..bb7269c7 100644 --- a/R/ModelFit.R +++ b/R/ModelFit.R @@ -785,7 +785,7 @@ getSEs <- function(object, covariates) { } #' @export -doBootstrap <- function(object, outFileName, treatmentId) { +runBootstrap <- function(object, outFileName, treatmentId) { .checkInterface(object$cyclopsData, testOnly = TRUE) bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId) bs From 92c9b9e17af6a238529a020c2b64b79629594d33 Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Tue, 11 Apr 2023 22:54:25 +0000 Subject: [PATCH 12/18] update automatically generated Rd --- man/createControl.Rd | 4 +++- man/fitCyclopsSimulation.Rd | 3 ++- man/getFineGrayWeights.Rd | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/man/createControl.Rd b/man/createControl.Rd index 6da2461c..4641897b 100644 --- a/man/createControl.Rd +++ b/man/createControl.Rd @@ -25,7 +25,9 @@ createControl( selectorType = "auto", initialBound = 2, maxBoundCount = 5, - algorithm = "ccd" + algorithm = "ccd", + doItAll = TRUE, + syncCV = FALSE ) } \arguments{ diff --git a/man/fitCyclopsSimulation.Rd b/man/fitCyclopsSimulation.Rd index 0d7270c8..5bcb427e 100644 --- a/man/fitCyclopsSimulation.Rd +++ b/man/fitCyclopsSimulation.Rd @@ -9,7 +9,8 @@ fitCyclopsSimulation( useCyclops = TRUE, model = "logistic", coverage = TRUE, - includePenalty = FALSE + includePenalty = FALSE, + computeDevice = "native" ) } \arguments{ diff --git a/man/getFineGrayWeights.Rd b/man/getFineGrayWeights.Rd index 21f2237f..0e92dce3 100644 --- a/man/getFineGrayWeights.Rd +++ b/man/getFineGrayWeights.Rd @@ -4,7 +4,7 @@ \alias{getFineGrayWeights} \title{Creates a \code{Surv} object that forces in competing risks and the IPCW needed for Fine-Gray estimation.} \usage{ -getFineGrayWeights(ftime, fstatus, cencode = 0, failcode = 1) +getFineGrayWeights(ftime, 
fstatus, cvweights = NULL, cencode = 0, failcode = 1)
 }
 \arguments{
 \item{ftime}{Numeric: Observed event (failure) times}

From 5e252ab47f8c9e3d3c7c879c05305a4a23b9a67e Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Fri, 14 Apr 2023 18:24:22 +0000
Subject: [PATCH 13/18] update R wrapper for bootstrap

---
 R/ModelFit.R                 | 11 +++++++++--
 R/RcppExports.R              |  4 ++--
 man/runBootstrap.Rd          | 20 ++++++++++++++++++++
 src/RcppCyclopsInterface.cpp |  3 ++-
 src/RcppExports.cpp          |  9 +++++----
 5 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 man/runBootstrap.Rd

diff --git a/R/ModelFit.R b/R/ModelFit.R
index bb7269c7..2802835e 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -784,10 +784,17 @@ getSEs <- function(object, covariates) {
     ses
 }
 
+#' @title Run Bootstrap for Cyclops model parameter
+#'
+#' @param object A fitted Cyclops model object
+#' @param outFileName Character: Output file name
+#' @param treatmentId Character: variable to output
+#' @param replicates Numeric: number of bootstrap samples
+#'
 #' @export
-runBootstrap <- function(object, outFileName, treatmentId) {
+runBootstrap <- function(object, outFileName, treatmentId, replicates) {
     .checkInterface(object$cyclopsData, testOnly = TRUE)
-    bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId)
+    bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId, replicates)
     bs
 }
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 3a5bac2a..6e67601e 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -93,8 +93,8 @@
     .Call(`_Cyclops_cyclopsFitModel`, inRcppCcdInterface)
 }
 
-.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId) {
-    .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId)
+.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId, replicates) {
+    .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId, replicates)
 }
 
diff --git a/man/runBootstrap.Rd b/man/runBootstrap.Rd
new file mode 100644
index 00000000..ecea445f
--- /dev/null
+++ b/man/runBootstrap.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ModelFit.R
+\name{runBootstrap}
+\alias{runBootstrap}
+\title{Run Bootstrap for Cyclops model parameter}
+\usage{
+runBootstrap(object, outFileName, treatmentId, replicates)
+}
+\arguments{
+\item{object}{A fitted Cyclops model object}
+
+\item{outFileName}{Character: Output file name}
+
+\item{treatmentId}{Character: variable to output}
+
+\item{replicates}{Numeric: number of bootstrap samples}
+}
+\description{
+Run Bootstrap for Cyclops model parameter
+}
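A usage sketch for the finished wrapper; the simulated data, output file name, and covariate
label below are illustrative, and `treatmentId` is assumed to match one of the covariate labels
in the Cyclops data object:

    sim <- simulateCyclopsData(nstrata = 1, nrows = 1000, ncovars = 5, model = "logistic")
    cyclopsData <- convertToCyclopsData(sim$outcomes, sim$covariates,
                                        modelType = "lr", addIntercept = TRUE)
    fit <- fitCyclopsModel(cyclopsData)
    runBootstrap(fit, outFileName = "bootstrap.csv", treatmentId = "1", replicates = 100)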
diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp
index 3085ecbc..5f3458b0 100644
--- a/src/RcppCyclopsInterface.cpp
+++ b/src/RcppCyclopsInterface.cpp
@@ -515,12 +515,13 @@ List cyclopsFitModel(SEXP inRcppCcdInterface) {
 }
 
 // [[Rcpp::export(".cyclopsRunBootstrap")]]
-List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId) {
+List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId, int replicates) {
 	using namespace bsccs;
 
 	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
 	interface->getArguments().doBootstrap = true;
 	interface->getArguments().outFileName = outFileName;
+	interface->getArguments().replicates = replicates;
 
 	// Save parameter point-estimates
 	std::vector<double> savedBeta;
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 06d6d34f..548d50d8 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -306,15 +306,16 @@ BEGIN_RCPP
 END_RCPP
 }
 // cyclopsRunBootstrap
-List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId);
-RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP) {
+List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId, int replicates);
+RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP, SEXP replicatesSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP);
    Rcpp::traits::input_parameter< const std::string& >::type outFileName(outFileNameSEXP);
    Rcpp::traits::input_parameter< std::string& >::type treatmentId(treatmentIdSEXP);
-    rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId));
+    Rcpp::traits::input_parameter< int >::type replicates(replicatesSEXP);
+    rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId, replicates));
    return rcpp_result_gen;
END_RCPP
}
@@ -818,7 +819,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_Cyclops_cyclopsSetControl", (DL_FUNC) &_Cyclops_cyclopsSetControl, 23},
     {"_Cyclops_cyclopsRunCrossValidationl", (DL_FUNC) &_Cyclops_cyclopsRunCrossValidationl, 1},
     {"_Cyclops_cyclopsFitModel", (DL_FUNC) &_Cyclops_cyclopsFitModel, 1},
-    {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 3},
+    {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 4},
     {"_Cyclops_cyclopsLogModel", (DL_FUNC) &_Cyclops_cyclopsLogModel, 1},
     {"_Cyclops_cyclopsInitializeModel", (DL_FUNC) &_Cyclops_cyclopsInitializeModel, 4},
     {"_Cyclops_listOpenCLDevices", (DL_FUNC) &_Cyclops_listOpenCLDevices, 0},

From c7710e1cda715470c1f8476aeda21221d9b529a2 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Fri, 14 Apr 2023 13:32:38 -0700
Subject: [PATCH 14/18] v3.3.1

---
 .Rbuildignore                           |  2 ++
 CRAN-SUBMISSION                         |  3 +++
 DESCRIPTION                             |  4 ++--
 NEWS.md                                 |  9 +++++++++
 cran-comments.md                        | 17 +++++++++++++----
 man/convertToCyclopsData.Rd             |  6 +++---
 src/RcppCyclopsInterface.cpp            |  6 +++---
 src/RcppModelData.cpp                   |  2 +-
 src/cyclops/CyclicCoordinateDescent.cpp |  4 ++++
 src/cyclops/engine/ModelSpecifics.hpp   |  4 ++--
 tools/configure.R                       |  7 +++++--
 11 files changed, 47 insertions(+), 17 deletions(-)
 create mode 100644 CRAN-SUBMISSION

diff --git a/.Rbuildignore b/.Rbuildignore
index 6d87dae1..094bb570 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -36,3 +36,5 @@ deprecated
 .covrignore
 src/Makevars$
 src/Makevars.win$
+to_fix*
+^CRAN-SUBMISSION$
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
new file mode 100644
index 00000000..e91558cb
--- /dev/null
+++ b/CRAN-SUBMISSION
@@ -0,0 +1,3 @@
+Version: 3.3.1
+Date: 2023-04-09 20:06:33 UTC
+SHA: d88c531efa06933c1007cb785343f1382c95ee0d
diff --git a/DESCRIPTION b/DESCRIPTION
index 3f493895..14d493be 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: Cyclops
 Type: Package
 Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis
-Version: 3.3.0
+Version: 3.3.1
 Authors@R: c(
     person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")),
     person("Martijn J.", "Schuemie", role = "aut"),
@@ -51,4 +51,4 @@ Suggests:
     ggplot2,
     microbenchmark,
     cmprsk
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
diff --git a/NEWS.md b/NEWS.md
index 3cf71839..4c72ca68 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,12 @@
+Cyclops v3.3.1
+==============
+
+Changes:
+
+1. fix uninitialized value detected in computeAsymptoticPrecisionMatrix(); value was priorType
+2. fix memory leak caused by call to ::Rf_error()
+3. fix line-endings on Makevars on windows
+
 Cyclops v3.3.0
 ==============
diff --git a/cran-comments.md b/cran-comments.md
index c4255b42..edfb58e1 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,16 +1,23 @@
-## Initial submission of patch update to package
+## Resubmission after VALGRIND feedback from Brian Ripley
 
-* fixed all WARNINGS, including generic-inconsistency, uninitialized values
-* fixed NOTE about C++11
+* fixed VALGRIND uninitialized value (UB) issue; variable 'priorType' was uninitialized,
+  specifically in:
+
+    ==2293460== Conditional jump or move depends on uninitialised value(s)
+    ==2293460==    at 0x22CE4848: bsccs::CyclicCoordinateDescent::computeAsymptoticPrecisionMatrix() (packages/tests-vg/Cyclops/src/cyclops/CyclicCoordinateDescent.cpp:1335)
+
+* fixed VALGRIND memory leak issue; was caused by calls to '::Rf_error()' instead of
+  'Rcpp::stop()' when handling some error edge-cases
 
 ## Test environments
 * local OS X install, R 4.1
+* r-devel-valgrind docker container
 * ubuntu 20.04 (via gh-actions: devel and release)
 * win-builder (devel and release)
 
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 1 occasional NOTE:
+* There is 1 occasional NOTE (besides days since last submission):
     checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
      libs  21.7Mb
@@ -23,4 +30,6 @@ availability of C++17 'if constexpr (...)' should decrease library size substant
 
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
+* 'EmpiricalCalibration' - checked and works.
 * 'IterativeHardThresholding' - checked and works.
+* 'BrokenAdaptiveRidge' - checked and works.
diff --git a/man/convertToCyclopsData.Rd b/man/convertToCyclopsData.Rd index 61a6bd75..391ec1ec 100644 --- a/man/convertToCyclopsData.Rd +++ b/man/convertToCyclopsData.Rd @@ -89,11 +89,11 @@ These columns are expected in the covariates object: } \section{Methods (by class)}{ \itemize{ -\item \code{data.frame}: Convert data from two \code{data.frame} +\item \code{convertToCyclopsData(data.frame)}: Convert data from two \code{data.frame} -\item \code{tbl_dbi}: Convert data from two \code{Andromeda} tables -}} +\item \code{convertToCyclopsData(tbl_dbi)}: Convert data from two \code{Andromeda} tables +}} \examples{ #Convert infert dataset to Cyclops format: covariates <- data.frame(stratumId = rep(infert$stratum, 2), diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp index ae9b1914..ae8ff12e 100644 --- a/src/RcppCyclopsInterface.cpp +++ b/src/RcppCyclopsInterface.cpp @@ -622,8 +622,8 @@ void RcppCcdInterface::appendRList(Rcpp::List& list, const Rcpp::List& append) { } void RcppCcdInterface::handleError(const std::string& str) { -// Rcpp::stop(str); // TODO Want this to work - ::Rf_error(str.c_str()); + Rcpp::stop(str); + // ::Rf_error(str.c_str()); } bsccs::ConvergenceType RcppCcdInterface::parseConvergenceType(const std::string& convergenceName) { @@ -946,7 +946,7 @@ void RcppCcdInterface::initializeModelImpl( // } // singlePrior->setVariance(0, arguments.hyperprior); - JointPriorPtr prior; + JointPriorPtr prior = nullptr; // if (arguments.flatPrior.size() == 0) { // prior = bsccs::make_shared(singlePrior); // } else { diff --git a/src/RcppModelData.cpp b/src/RcppModelData.cpp index 7eb7796b..f585a6fe 100644 --- a/src/RcppModelData.cpp +++ b/src/RcppModelData.cpp @@ -675,7 +675,7 @@ List cyclopsReadFileData(const std::string& fileName, const std::string& modelTy //const std::vector& y = ptr->getYVectorRef(); double total = 0.0; //std::accumulate(y.begin(), y.end(), 0.0); - // delete reader; // TODO Test + delete reader; // TODO Use smart_ptr double time = timer(); List list = List::create( diff --git a/src/cyclops/CyclicCoordinateDescent.cpp b/src/cyclops/CyclicCoordinateDescent.cpp index fbb03df2..6e54b9f2 100644 --- a/src/cyclops/CyclicCoordinateDescent.cpp +++ b/src/cyclops/CyclicCoordinateDescent.cpp @@ -54,6 +54,8 @@ CyclicCoordinateDescent::CyclicCoordinateDescent( noiseLevel = NOISY; initialBound = 2.0; + priorType = priors::NONE; + init(hXI.getHasOffsetCovariate()); } @@ -89,6 +91,8 @@ CyclicCoordinateDescent::CyclicCoordinateDescent(const CyclicCoordinateDescent& noiseLevel = copy.noiseLevel; initialBound = copy.initialBound; + priorType = copy.priorType; + init(hXI.getHasOffsetCovariate()); if (copy.hWeights.size() > 0) { diff --git a/src/cyclops/engine/ModelSpecifics.hpp b/src/cyclops/engine/ModelSpecifics.hpp index 84f9eed3..e08e7da1 100644 --- a/src/cyclops/engine/ModelSpecifics.hpp +++ b/src/cyclops/engine/ModelSpecifics.hpp @@ -1124,7 +1124,7 @@ void ModelSpecifics::computeGradientAndHessianImpl(int index // gradient = result.real(); // hessian = result.imag(); - using RealTypePair = std::pair; + // using RealTypePair = std::pair; IteratorType it(hX, index); const int end = it.size() - 1; @@ -1266,7 +1266,7 @@ void ModelSpecifics::computeThirdDerivativeImpl(int index, d #endif RealType third = static_cast(0); - RealType hessian = static_cast(0); + // RealType hessian = static_cast(0); if (BaseModel::cumulativeGradientAndHessian) { // Compile-time switch diff --git a/tools/configure.R b/tools/configure.R index d9853786..08ab5033 100644 --- 
a/tools/configure.R
+++ b/tools/configure.R
@@ -19,5 +19,8 @@ if (getRversion() < "4.1") {
   }
 }
 
-cat(txt, file = makevars_out, sep = "\n")
-cat(txt_win, file = makevars_win_out, sep = "\n")
+if (.Platform$OS.type == "unix") {
+  cat(txt, file = makevars_out, sep = "\n")
+} else {
+  cat(txt_win, file = makevars_win_out, sep = "\n")
+}

From 478e5ac4a21b6b5616c5252ab4276f2b2f5a9fad Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Thu, 18 May 2023 05:18:52 -0700
Subject: [PATCH 15/18] add auto option to cvRepetitions

---
 NEWS.md                  |  2 ++
 R/ModelFit.R             | 18 ++++++++++++++++++
 tests/testthat/test-cv.R | 20 ++++++++++++++++++++
 tools/configure.R        |  2 +-
 4 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 33852b06..943d8096 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,8 @@ develop
 Changes:
 
 1. improvements on adaptive likelihood profiling
+2. add `auto` option to `cvRepetitions`
+3. bumped explicit C++11 requirement up to R v4.1
 
 Cyclops v3.3.1
 ==============
diff --git a/R/ModelFit.R b/R/ModelFit.R
index c4fbae74..385a011c 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -204,6 +204,11 @@ fitCyclopsModel <- function(cyclopsData,
             writeLines(paste("Using cross-validation selector type", control$selectorType))
         }
     }
+
+    if (control$cvRepetitions == "auto") {
+        control$cvRepetitions <- .getNumberOfRepetitions(getNumberOfRows(cyclopsData))
+    }
+
     control <- .setControl(cyclopsData$cyclopsInterfacePtr, control)
 
     threads <- control$threads
@@ -324,6 +329,11 @@ fitCyclopsModel <- function(cyclopsData,
     fit$scale <- cyclopsData$scale
     fit$threads <- threads
     fit$seed <- control$seed
+
+    if (prior$useCrossValidation) {
+        fit$cvRepetitions <- control$cvRepetitions
+    }
+
     class(fit) <- "cyclopsFit"
     return(fit)
 }
@@ -1116,3 +1126,11 @@ convertToGlmnetLambda <- function(variance, nobs) {
     graph <- lapply(0:(nParents - 1), function(n, types) { 0:(nTypes - 1) + n * nTypes }, types = nTypes)
     graph
 }
+
+.getNumberOfRepetitions <- function(nrows) {
+    top <- 1E2
+    factor <- 0.5
+    pmax(pmin(
+        ceiling(top / nrows^(factor))
+        , 10), 1)
+}
diff --git a/tests/testthat/test-cv.R b/tests/testthat/test-cv.R
index 8b47229c..3252e670 100644
--- a/tests/testthat/test-cv.R
+++ b/tests/testthat/test-cv.R
@@ -206,3 +206,23 @@ test_that("Seed gets returned", {
                            control = createControl(seed = NULL), warnings = FALSE)
     expect_true(!is.null(fit$seed))
 })
+
+test_that("Auto cvRepetitions", {
+    data <- simulateCyclopsData(nstrata = 1, nrows = 1000, ncovars = 10, model = "logistic")
+    cyclopsData <- convertToCyclopsData(data$outcomes,
+                                        data$covariates,
+                                        modelType = "lr",
+                                        addIntercept = TRUE)
+
+    prior <- createPrior("laplace", exclude = c(0), useCrossValidation = TRUE)
+
+    control <- createControl(noiseLevel = "quiet",
+                             cvType = "auto",
+                             cvRepetitions = "auto")
+
+    fit <- fitCyclopsModel(cyclopsData,
+                           prior = prior,
+                           control = control)
+
+    expect_equal(fit$cvRepetitions, 4)
+})
diff --git a/tools/configure.R b/tools/configure.R
index 08ab5033..acdda039 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -9,7 +9,7 @@ makevars_win_out <- file.path("src", "Makevars.win")
 txt <- readLines(makevars_in)
 txt_win <- readLines(makevars_win_in)
 
-if (getRversion() < "4.1") {
+if (getRversion() < "4.2") {
   if (!any(grepl("^CXX_STD", txt))) {
     txt <- c("CXX_STD = CXX11", txt)
  }
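For reference, the `auto` heuristic in `.getNumberOfRepetitions()` reduces to
ceiling(100 / sqrt(nrows)), clamped to the range [1, 10]; working it through explains the value
the new test expects:

    ceiling(100 / sqrt(1000))   # 4, matching expect_equal(fit$cvRepetitions, 4)
    ceiling(100 / sqrt(100))    # 10: small data sets get up to the maximum of 10 repetitions
    ceiling(100 / sqrt(1e6))    # 1: very large data sets get a single repetition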
From b20b830d4a2ec67ad834f62bc6cf8956179b20d0 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Fri, 19 May 2023 16:11:21 -0700
Subject: [PATCH 16/18] play with CXX11 flag some more

---
 tools/configure.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/configure.R b/tools/configure.R
index acdda039..4b4222e9 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -9,11 +9,13 @@ makevars_win_out <- file.path("src", "Makevars.win")
 txt <- readLines(makevars_in)
 txt_win <- readLines(makevars_win_in)
 
-if (getRversion() < "4.2") {
+if (getRversion() < "4.3") { # macOS / linux
   if (!any(grepl("^CXX_STD", txt))) {
     txt <- c("CXX_STD = CXX11", txt)
   }
+}
 
+if (getRversion() < "4.2") { # Windoz
   if (!any(grepl("^CXX_STD", txt_win))) {
     txt_win <- c("CXX_STD = CXX11", txt_win)
   }

From 60ebb58ce21bbeacb8a9148f21b0dfcab6a2839d Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Mon, 25 Sep 2023 06:52:05 -0700
Subject: [PATCH 17/18] new CRAN format

---
 R/Cyclops-package.R | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 R/Cyclops-package.R

diff --git a/R/Cyclops-package.R b/R/Cyclops-package.R
new file mode 100644
index 00000000..a65cf643
--- /dev/null
+++ b/R/Cyclops-package.R
@@ -0,0 +1,6 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+## usethis namespace: end
+NULL

From 95bfa2b7d8d9cf37dba148ef051d475a0ca71067 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Mon, 25 Sep 2023 09:37:39 -0700
Subject: [PATCH 18/18] counting process reformatting

---
 NAMESPACE                       |   3 +
 R/TimeEffects.R                 | 132 ++++++++++++++++++++++++++++++++
 man/Cyclops-package.Rd          |  41 ++++++++++
 man/convertToTimeVaryingCoef.Rd |  39 ++++++++++
 man/splitTime.Rd                |  26 +++++++
 5 files changed, 241 insertions(+)
 create mode 100644 R/TimeEffects.R
 create mode 100644 man/Cyclops-package.Rd
 create mode 100644 man/convertToTimeVaryingCoef.Rd
 create mode 100644 man/splitTime.Rd

diff --git a/NAMESPACE b/NAMESPACE
index d632ced3..4fc36ad5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ S3method(survfit,cyclopsFit)
 S3method(vcov,cyclopsFit)
 export(Multitype)
 export(convertToCyclopsData)
+export(convertToTimeVaryingCoef)
 export(coverage)
 export(createAutoGridCrossValidationControl)
 export(createControl)
@@ -40,6 +41,7 @@ export(meanLinearPredictor)
 export(mse)
 export(readCyclopsData)
 export(simulateCyclopsData)
+export(splitTime)
 import(Matrix)
 import(Rcpp)
 import(dplyr)
@@ -72,5 +74,6 @@ importFrom(stats,terms)
 importFrom(stats,time)
 importFrom(stats,vcov)
 importFrom(survival,Surv)
+importFrom(survival,survSplit)
 importFrom(survival,survfit)
 useDynLib(Cyclops, .registration = TRUE)
diff --git a/R/TimeEffects.R b/R/TimeEffects.R
new file mode 100644
index 00000000..5ae7aa0e
--- /dev/null
+++ b/R/TimeEffects.R
@@ -0,0 +1,132 @@
+#' @title Split the analysis time into several intervals for time-varying coefficients.
+#'
+#' @description
+#' \code{splitTime} splits the analysis time into several intervals for time-varying coefficients
+#'
+#' @param shortOut A data frame containing the outcomes with predefined columns (see below).
+#' @param cut Numeric: Time points to cut at
+#'
+#' These columns are expected in the shortOut object:
+#' \tabular{lll}{
+#'   \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{y}     \tab(real) \tab Observed event status \cr
+#'   \verb{time}  \tab(real) \tab Observed event time \cr
+#' }
+#'
+#' @return A long outcome table for time-varying coefficients.
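+#'
+#' @examples
+#' ## A minimal sketch on hypothetical data: cut follow-up time at days 30 and 90
+#' shortOut <- data.frame(rowId = 1:3, y = c(1, 0, 1), time = c(45, 120, 20))
+#' longOut <- splitTime(shortOut, cut = c(30, 90))
+#' ## Each subject contributes one row per interval reached; stratumId indexes the
+#' ## interval, and y is non-zero only in the interval containing the event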
+#' @importFrom survival survfit survSplit
+#' @export
+splitTime <- function(shortOut, cut) {
+
+    if (!"time" %in% colnames(shortOut)) stop("Must provide observed event time.")
+    if (!"y" %in% colnames(shortOut)) stop("Must provide observed event status.")
+    if ("rowId" %in% colnames(shortOut)) {
+        shortOut <- shortOut %>%
+            rename(subjectId = rowId) %>%
+            arrange(subjectId)
+    } else {
+        shortOut <- shortOut %>%
+            mutate(subjectId = row_number())
+    }
+
+    shortOut <- collect(shortOut)
+    longOut <- do.call('survSplit', list(formula = Surv(shortOut$time, shortOut$y)~.,
+                                         data = shortOut,
+                                         cut = cut,
+                                         episode = "stratumId",
+                                         id = "newSubjectId"))
+    longOut <- longOut %>%
+        rename(y = event) %>%
+        mutate(time = tstop - tstart) %>%
+        select(-c(newSubjectId, tstart, tstop)) %>%
+        arrange(stratumId, subjectId)
+
+    # Restore rowIds
+    SubjectIds <- shortOut$subjectId
+    newSubjectId <- max(SubjectIds)+1
+    longOut$rowId <- c(SubjectIds, # rowId = subjectId at 1st stratum
+                       newSubjectId:(newSubjectId+(nrow(longOut)-length(SubjectIds))-1)) # create new distinct rowIds for other strata
+
+    # Reorder columns
+    longOut <- longOut %>%
+        select(rowId, everything()) %>%
+        select(subjectId, everything()) %>%
+        select(stratumId, everything())
+
+    return(longOut)
+}
+
+#' @title Convert short sparse covariate table to long sparse covariate table for time-varying coefficients.
+#'
+#' @description
+#' \code{convertToTimeVaryingCoef} converts a short sparse covariate table to a long sparse covariate table for time-varying coefficients.
+#'
+#' @param shortCov A data frame containing the covariate with predefined columns (see below).
+#' @param longOut A data frame containing the outcomes with predefined columns (see below), output of \code{splitTime}.
+#' @param timeVaryCoefId Integer: A numeric identifier of a time-varying coefficient
+#'
+#' @details
+#' These columns are expected in the shortCov object:
+#' \tabular{lll}{
+#'   \verb{rowId}          \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{covariateId}    \tab(integer) \tab A numeric identifier of a covariate \cr
+#'   \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr
+#' }
+#'
+#' These columns are expected in the longOut object:
+#' \tabular{lll}{
+#'   \verb{stratumId} \tab(integer) \tab Stratum ID for time-varying models \cr
+#'   \verb{subjectId} \tab(integer) \tab Subject ID is used to link multiple covariates (x) at different time intervals to a single subject \cr
+#'   \verb{rowId}     \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{y}         \tab(real) \tab The outcome variable \cr
+#'   \verb{time}      \tab(real) \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr
+#'                    \tab \tab(e.g. number of days) \cr
+#' }
+#' @return A long sparse covariate table for time-varying coefficients.
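+#'
+#' @examples
+#' ## A minimal sketch on hypothetical data: give covariate 2 a time-varying effect
+#' shortCov <- data.frame(rowId = c(1, 1, 2),
+#'                        covariateId = c(1, 2, 2),
+#'                        covariateValue = c(1, 1, 1))
+#' shortOut <- data.frame(rowId = 1:2, y = c(1, 0), time = c(45, 120))
+#' longOut <- splitTime(shortOut, cut = c(30, 90))
+#' longCov <- convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId = 2)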
+#' @export +convertToTimeVaryingCoef <- function(shortCov, longOut, timeVaryCoefId) { + + # Process time-varying coefficients + timeVaryCoefId <- sort(unique(timeVaryCoefId)) + numTime <- length(timeVaryCoefId) # number of time-varying covariates + maxCovId <- max(shortCov$covariateId) + + # First stratum + longCov <- shortCov + longCov$stratumId <- 1 + colnames(longCov)[which(names(longCov) == "rowId")] <- "subjectId" + colnames(shortCov)[which(names(shortCov) == "rowId")] <- "subjectId" + + # Rest of strata + maxStrata <- max(longOut$stratumId) + for (st in 2:maxStrata) { + + # get valid subjects in current stratum + subId <- longOut[longOut$stratumId == st, ]$subjectId + + # get valid sparse covariates information in current stratum + curStrata <- shortCov[shortCov$subjectId %in% subId, ] + + if (any(curStrata$covariateId %in% timeVaryCoefId)) { # skip when valid subjects only have non-zero time-indep covariates + curStrata$stratumId <- st # assign current stratumId + + # recode covariateId for time-varying coefficients + # TODO update label + for (i in 1:numTime) { + curStrata[curStrata$covariateId == timeVaryCoefId[i], "covariateId"] <- maxCovId + numTime * (st - 2) + i + } + + # bind current stratum to longCov + longCov <- rbind(longCov, curStrata) + } + } + + # match rowId in longCov + longCov$rowId <- NA + for (i in 1:nrow(longCov)) { + longCov$rowId[i] <- longOut[with(longOut, subjectId == longCov$subjectId[i] & stratumId == longCov$stratumId[i]), "rowId"] + } + + return(longCov) +} + diff --git a/man/Cyclops-package.Rd b/man/Cyclops-package.Rd new file mode 100644 index 00000000..d726da88 --- /dev/null +++ b/man/Cyclops-package.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Cyclops-package.R +\docType{package} +\name{Cyclops-package} +\alias{Cyclops} +\alias{Cyclops-package} +\title{Cyclops: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis} +\description{ +This model fitting tool incorporates cyclic coordinate descent and majorization-minimization approaches to fit a variety of regression models found in large-scale observational healthcare data. Implementations focus on computational optimization and fine-scale parallelization to yield efficient inference in massive datasets. Please see: Suchard, Simpson, Zorych, Ryan and Madigan (2013) \doi{10.1145/2414416.2414791}. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/ohdsi/cyclops} + \item Report bugs at \url{https://github.com/ohdsi/cyclops/issues} +} + +} +\author{ +\strong{Maintainer}: Marc A. Suchard \email{msuchard@ucla.edu} + +Authors: +\itemize{ + \item Martijn J. Schuemie + \item Trevor R. 
Shaddox
+  \item Yuxi Tian
+  \item Jianxiao Yang
+  \item Eric Kawaguchi
+}
+
+Other contributors:
+\itemize{
+  \item Sushil Mittal [contributor]
+  \item Observational Health Data Sciences and Informatics [copyright holder]
+  \item Marcus Geelnard (provided the TinyThread library) [copyright holder, contributor]
+  \item Rutgers University (provided the HParSearch routine) [copyright holder, contributor]
+  \item R Development Core Team (provided the ZeroIn routine) [copyright holder, contributor]
+}
+
+}
+\keyword{internal}
diff --git a/man/convertToTimeVaryingCoef.Rd b/man/convertToTimeVaryingCoef.Rd
new file mode 100644
index 00000000..b587f8f2
--- /dev/null
+++ b/man/convertToTimeVaryingCoef.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/TimeEffects.R
+\name{convertToTimeVaryingCoef}
+\alias{convertToTimeVaryingCoef}
+\title{Convert short sparse covariate table to long sparse covariate table for time-varying coefficients.}
+\usage{
+convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId)
+}
+\arguments{
+\item{shortCov}{A data frame containing the covariate with predefined columns (see below).}
+
+\item{longOut}{A data frame containing the outcomes with predefined columns (see below), output of \code{splitTime}.}
+
+\item{timeVaryCoefId}{Integer: A numeric identifier of a time-varying coefficient}
+}
+\value{
+A long sparse covariate table for time-varying coefficients.
+}
+\description{
+\code{convertToTimeVaryingCoef} converts a short sparse covariate table to a long sparse covariate table for time-varying coefficients.
+}
+\details{
+These columns are expected in the shortCov object:
+\tabular{lll}{
+  \verb{rowId}          \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{covariateId}    \tab(integer) \tab A numeric identifier of a covariate \cr
+  \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr
+}
+
+These columns are expected in the longOut object:
+\tabular{lll}{
+  \verb{stratumId} \tab(integer) \tab Stratum ID for time-varying models \cr
+  \verb{subjectId} \tab(integer) \tab Subject ID is used to link multiple covariates (x) at different time intervals to a single subject \cr
+  \verb{rowId}     \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{y}         \tab(real) \tab The outcome variable \cr
+  \verb{time}      \tab(real) \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr
+                   \tab \tab(e.g. number of days) \cr
+}
+}
diff --git a/man/splitTime.Rd b/man/splitTime.Rd
new file mode 100644
index 00000000..fedc4a7e
--- /dev/null
+++ b/man/splitTime.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/TimeEffects.R
+\name{splitTime}
+\alias{splitTime}
+\title{Split the analysis time into several intervals for time-varying coefficients.}
+\usage{
+splitTime(shortOut, cut)
+}
+\arguments{
+\item{shortOut}{A data frame containing the outcomes with predefined columns (see below).}
+
+\item{cut}{Numeric: Time points to cut at
+
+These columns are expected in the shortOut object:
+\tabular{lll}{
+  \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{y}     \tab(real) \tab Observed event status \cr
+  \verb{time}  \tab(real) \tab Observed event time \cr
+}}
+}
+\value{
+A long outcome table for time-varying coefficients.
+}
+\description{
+\code{splitTime} splits the analysis time into several intervals for time-varying coefficients
+}
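Taken together, the two new helpers feed the existing model-fitting pipeline; a hedged
end-to-end sketch, continuing the hypothetical examples above and assuming the long tables
satisfy the column requirements of convertToCyclopsData for a Cox model:

    longOut <- splitTime(shortOut, cut = c(30, 90))
    longCov <- convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId = 2)
    cyclopsData <- convertToCyclopsData(longOut, longCov, modelType = "cox")
    fit <- fitCyclopsModel(cyclopsData)
    coef(fit)   # the recoded covariate ids carry one coefficient per time interval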