From f320511dce6374fc7e545a19edda52cfc471ab6f Mon Sep 17 00:00:00 2001
From: Henrik John
Date: Wed, 18 May 2022 17:40:06 +0200
Subject: [PATCH 01/18] Resolve an issue with starting betas being reset

During cross-validation, the beta values are reset to 0.0 when they should instead be reset to the starting betas, if provided.
---
 R/ModelFit.R                            |  1 +
 R/RcppExports.R                         | 12 +++++++
 src/RcppCyclopsInterface.cpp            | 22 +++++++++++++
 src/RcppExports.cpp                     | 43 +++++++++++++++++++++++++
 src/cyclops/CyclicCoordinateDescent.cpp | 20 +++++++++++-
 src/cyclops/CyclicCoordinateDescent.h   |  3 ++
 6 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/R/ModelFit.R b/R/ModelFit.R
index 79f00152..49424278 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -200,6 +200,7 @@ fitCyclopsModel <- function(cyclopsData,
         }
         .cyclopsSetBeta(cyclopsData$cyclopsInterfacePtr, startingCoefficients)
+        .cyclopsSetStartingBeta(cyclopsData$cyclopsInterfacePtr, startingCoefficients)
     }

     if (!is.null(fixedCoefficients)) {
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 24c97eb0..2794d953 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -25,10 +25,22 @@
     invisible(.Call(`_Cyclops_cyclopsSetBeta`, inRcppCcdInterface, beta))
 }

+.cyclopsGetBeta <- function(inRcppCcdInterface, index) {
+    .Call(`_Cyclops_cyclopsGetBeta`, inRcppCcdInterface, index)
+}
+
+.cyclopsSetStartingBeta <- function(inRcppCcdInterface, inStartingBeta) {
+    invisible(.Call(`_Cyclops_cyclopsSetStartingBeta`, inRcppCcdInterface, inStartingBeta))
+}
+
 .cyclopsSetFixedBeta <- function(inRcppCcdInterface, beta, fixed) {
     invisible(.Call(`_Cyclops_cyclopsSetFixedBeta`, inRcppCcdInterface, beta, fixed))
 }

+.cyclopsGetFixedBeta <- function(inRcppCcdInterface, index) {
+    .Call(`_Cyclops_cyclopsGetFixedBeta`, inRcppCcdInterface, index)
+}
+
 .cyclopsGetIsRegularized <- function(inRcppCcdInterface, index) {
     .Call(`_Cyclops_cyclopsGetIsRegularized`, inRcppCcdInterface, index)
 }
diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp
index 829d757b..eca98a13 100644
--- a/src/RcppCyclopsInterface.cpp
+++ b/src/RcppCyclopsInterface.cpp
@@ -112,6 +112,20 @@ void cyclopsSetBeta(SEXP inRcppCcdInterface, const std::vector<double>& beta) {
 	interface->getCcd().setBeta(beta);
 }

+// [[Rcpp::export(.cyclopsGetBeta)]]
+double cyclopsGetBeta(SEXP inRcppCcdInterface, const int index) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	return interface->getCcd().getBeta(index);
+}
+
+// [[Rcpp::export(.cyclopsSetStartingBeta)]]
+void cyclopsSetStartingBeta(SEXP inRcppCcdInterface, const std::vector<double>& inStartingBeta) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	interface->getCcd().setStartingBeta(inStartingBeta);
+}
+
 // [[Rcpp::export(.cyclopsSetFixedBeta)]]
 void cyclopsSetFixedBeta(SEXP inRcppCcdInterface, int beta, bool fixed) {
 	using namespace bsccs;
 	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
 	interface->getCcd().setFixedBeta(beta - 1, fixed);
 }

+// [[Rcpp::export(.cyclopsGetFixedBeta)]]
+bool cyclopsGetFixedBeta(SEXP inRcppCcdInterface, const int index) {
+	using namespace bsccs;
+	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
+	return interface->getCcd().getFixedBeta(index);
+}
+
+
 // [[Rcpp::export(".cyclopsGetIsRegularized")]]
 bool cyclopsGetIsRegularized(SEXP inRcppCcdInterface, const int index) {
 	using namespace bsccs;
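[Editorial note — not part of the patch.] The R sketch below illustrates the behaviour this commit fixes; the data, formula, and prior settings are hypothetical, while createCyclopsData(), createPrior(), fitCyclopsModel(), and its startingCoefficients argument are the existing Cyclops R API:

    library(Cyclops)

    # Hypothetical logistic-regression data
    cyclopsData <- createCyclopsData(y ~ x1 + x2, modelType = "lr")

    # With this patch, the starting coefficients are also retained as the values
    # that resetBeta() restores between cross-validation folds, instead of the
    # betas being silently reset to 0.0 after the first fold.
    fit <- fitCyclopsModel(cyclopsData,
                           prior = createPrior("laplace", useCrossValidation = TRUE),
                           startingCoefficients = c(-1.0, 0.5, 0.25))

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index daadf98a..1b059404 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,6 +6,11 @@
 using namespace Rcpp;

+#ifdef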
RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // cyclopsGetModelTypeNames std::vector cyclopsGetModelTypeNames(); RcppExport SEXP _Cyclops_cyclopsGetModelTypeNames() { @@ -68,6 +73,29 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// cyclopsGetBeta +double cyclopsGetBeta(SEXP inRcppCcdInterface, const int index); +RcppExport SEXP _Cyclops_cyclopsGetBeta(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsGetBeta(inRcppCcdInterface, index)); + return rcpp_result_gen; +END_RCPP +} +// cyclopsSetStartingBeta +void cyclopsSetStartingBeta(SEXP inRcppCcdInterface, const std::vector& inStartingBeta); +RcppExport SEXP _Cyclops_cyclopsSetStartingBeta(SEXP inRcppCcdInterfaceSEXP, SEXP inStartingBetaSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const std::vector& >::type inStartingBeta(inStartingBetaSEXP); + cyclopsSetStartingBeta(inRcppCcdInterface, inStartingBeta); + return R_NilValue; +END_RCPP +} // cyclopsSetFixedBeta void cyclopsSetFixedBeta(SEXP inRcppCcdInterface, int beta, bool fixed); RcppExport SEXP _Cyclops_cyclopsSetFixedBeta(SEXP inRcppCcdInterfaceSEXP, SEXP betaSEXP, SEXP fixedSEXP) { @@ -80,6 +108,18 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// cyclopsGetFixedBeta +bool cyclopsGetFixedBeta(SEXP inRcppCcdInterface, const int index); +RcppExport SEXP _Cyclops_cyclopsGetFixedBeta(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsGetFixedBeta(inRcppCcdInterface, index)); + return rcpp_result_gen; +END_RCPP +} // cyclopsGetIsRegularized bool cyclopsGetIsRegularized(SEXP inRcppCcdInterface, const int index); RcppExport SEXP _Cyclops_cyclopsGetIsRegularized(SEXP inRcppCcdInterfaceSEXP, SEXP indexSEXP) { @@ -771,7 +811,10 @@ static const R_CallMethodDef CallEntries[] = { {"_Cyclops_cyclopsGetUseOffsetNames", (DL_FUNC) &_Cyclops_cyclopsGetUseOffsetNames, 0}, {"_Cyclops_cyclopsGetComputeDevice", (DL_FUNC) &_Cyclops_cyclopsGetComputeDevice, 1}, {"_Cyclops_cyclopsSetBeta", (DL_FUNC) &_Cyclops_cyclopsSetBeta, 2}, + {"_Cyclops_cyclopsGetBeta", (DL_FUNC) &_Cyclops_cyclopsGetBeta, 2}, + {"_Cyclops_cyclopsSetStartingBeta", (DL_FUNC) &_Cyclops_cyclopsSetStartingBeta, 2}, {"_Cyclops_cyclopsSetFixedBeta", (DL_FUNC) &_Cyclops_cyclopsSetFixedBeta, 3}, + {"_Cyclops_cyclopsGetFixedBeta", (DL_FUNC) &_Cyclops_cyclopsGetFixedBeta, 2}, {"_Cyclops_cyclopsGetIsRegularized", (DL_FUNC) &_Cyclops_cyclopsGetIsRegularized, 2}, {"_Cyclops_cyclopsSetWeights", (DL_FUNC) &_Cyclops_cyclopsSetWeights, 2}, {"_Cyclops_cyclopsSetCensorWeights", (DL_FUNC) &_Cyclops_cyclopsSetCensorWeights, 2}, diff --git a/src/cyclops/CyclicCoordinateDescent.cpp b/src/cyclops/CyclicCoordinateDescent.cpp index 7ec11e3e..20d51d31 100644 --- a/src/cyclops/CyclicCoordinateDescent.cpp +++ b/src/cyclops/CyclicCoordinateDescent.cpp @@ -203,10 +203,14 @@ void 
CyclicCoordinateDescent::init(bool offset) {
 	hDelta.resize(J, static_cast<double>(initialBound));
 	hBeta.resize(J, static_cast<double>(0.0));

+	// initialize starting betas to default value 0.0
+	startingBeta.resize(J, static_cast<double>(0.0));
+
 //	hXBeta.resize(K, static_cast<double>(0.0));
 //	hXBetaSave.resize(K, static_cast<double>(0.0));

 	fixBeta.resize(J, false);
+
 	hWeights.resize(0);
 	cWeights.resize(0); // ESK: For censor weights

@@ -217,6 +221,7 @@ void CyclicCoordinateDescent::init(bool offset) {
 	varianceKnown = false;
 	if (offset) {
 		hBeta[0] = static_cast<double>(1);
+		startingBeta[0] = static_cast<double>(1);
 		fixBeta[0] = true;
 		xBetaKnown = false;
 	} else {
@@ -257,7 +262,13 @@ void CyclicCoordinateDescent::resetBeta(void) {
 	auto start = hXI.getHasOffsetCovariate() ? 1 : 0;
 	for (auto j = start; j < J; j++) {
-		hBeta[j] = 0.0;
+
+		// restore the starting beta if a non-zero value was provided
+		if (startingBeta[j] != 0.0) {
+			hBeta[j] = startingBeta[j];
+		} else {
+			hBeta[j] = 0.0;
+		}
 	}
 	computeXBeta();
 	sufficientStatisticsKnown = false;
@@ -470,6 +481,13 @@ void CyclicCoordinateDescent::setBeta(const std::vector<double>& beta) {
 	varianceKnown = false;
 }

+void CyclicCoordinateDescent::setStartingBeta(const std::vector<double>& inStartingBeta) {
+	// ToDo: This functionality could be merged into setBeta()
+	for (int j = 0; j < J; ++j) {
+		startingBeta[j] = inStartingBeta[j];
+	}
+}
+
 void CyclicCoordinateDescent::setBeta(int i, double beta) {
 #define PROCESS_IN_MS
 #ifdef PROCESS_IN_MS
diff --git a/src/cyclops/CyclicCoordinateDescent.h b/src/cyclops/CyclicCoordinateDescent.h
index 2e4e3575..e18c5ce0 100644
--- a/src/cyclops/CyclicCoordinateDescent.h
+++ b/src/cyclops/CyclicCoordinateDescent.h
@@ -131,6 +131,8 @@ class CyclicCoordinateDescent {
 	// template
 	void setBeta(const std::vector<double>& beta);

+	void setStartingBeta(const std::vector<double>& inStartingBeta);
+
 	void setBeta(int i, double beta);

 	// void double getHessianComponent(int i, int j);
@@ -328,6 +330,7 @@ class CyclicCoordinateDescent {
 	typedef std::vector<double> DoubleVector;

 	DoubleVector hBeta;
+	DoubleVector startingBeta;
 //	DoubleVector& hXBeta; // TODO Delegate to ModelSpecifics
 //	DoubleVector& hXBetaSave; // Delegate

From da4b7e706073f0aa5bec074de50908d3989041b3 Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Fri, 28 Oct 2022 23:12:58 +0000
Subject: [PATCH 02/18] add example for ScanReduce()

---
 .../fused_kernel/device_scan_reduce.cuh | 108 +++++++++++++++++-
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
index 41d6a39c..6764ea95 100644
--- a/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
+++ b/src/cyclops/engine/fused_kernel/device_scan_reduce.cuh
@@ -51,7 +51,109 @@ struct DeviceFuse
 //@{

 /**
- * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+ * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified
+ * binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+ *
+ * @par Snippet
+ * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int`
+ * device vector.
+ *
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * #include "device_scan_reduce.cuh"
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items;     // e.g., 7
+ * int *d_in;         // e.g., [1, 2, 2, 1, 3, 0, 1]
+ * int *d_trans_in;   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_sum;        // e.g., [-]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for the
+ * // fused scan-reduce
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), cub::Sum(), min_op,
+ *   num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run the fused scan-reduce
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), cub::Sum(), min_op,
+ *   num_items);
+ *
+ * // d_sum <-- [26]
+ * @endcode
+ *
+ * @tparam ScanInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading scan
+ *   inputs \iterator
+ *
+ * @tparam TransformInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading additional
+ *   input for transformation \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Random-access output iterator type for writing reduction
+ *   outputs \iterator
+ *
+ * @tparam ScanOpT
+ *   **[inferred]** Binary scan functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam ReductionOpT
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam TransformOpT
+ *   **[inferred]** Binary transform functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Random-access iterator to the input sequence of data items
+ *
+ * @param[in] d_trans_in
+ *   Random-access iterator to the additional input sequence of data items
+ *
+ * @param[out] d_sum
+ *   Random-access iterator to the output aggregate
+ *
+ * @param[in] scan_op
+ *   Binary scan functor
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] transform_op
+ *   Binary transform functor applied to each scanned item and the
+ *   corresponding element of `d_trans_in` before reduction
+ *
+ * @param[in] num_items
+ *   Total number of input items (i.e., the length of `d_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
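+ *
+ * @par
+ * For reference, the following sequential sketch is an editorial addition, inferred from the
+ * agent implementation rather than taken from the original documentation; the helper name
+ * `ReferenceScanReduce` is illustrative only. It spells out the quantity the fused kernel computes:
+ * @code
+ * // Illustrative reference (not part of this header): reduce
+ * // transform_op(inclusive_scan(d_in)[i], d_trans_in[i]) over all i.
+ * template <typename T, typename ScanOpT, typename ReductionOpT, typename TransformOpT>
+ * T ReferenceScanReduce(const T *in, const T *trans_in, int num_items,
+ *                       ScanOpT scan_op, ReductionOpT reduction_op, TransformOpT transform_op)
+ * {
+ *     T running   = in[0];                               // inclusive prefix scan accumulator
+ *     T aggregate = transform_op(running, trans_in[0]);  // transform the first scanned item
+ *     for (int i = 1; i < num_items; ++i)
+ *     {
+ *         running   = scan_op(running, in[i]);           // scan[i] = scan_op(scan[i-1], in[i])
+ *         aggregate = reduction_op(aggregate, transform_op(running, trans_in[i]));
+ *     }
+ *     return aggregate;                                  // yields 26 for the example values above
+ * }
+ * @endcode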
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename ScanInputIteratorT, @@ -95,10 +197,6 @@ struct DeviceFuse }; -/** - * \example example_device_fuse.cu - */ - CUB_NAMESPACE_END From 189ffa36a07b2d775e7b127ea2a6442393c78fed Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Sat, 5 Nov 2022 06:09:00 +0000 Subject: [PATCH 03/18] update fused kernel for cub_1.8.0 --- src/Makevars | 1 + src/cyclops/engine/CudaKernel.cu | 3 +- .../fused_kernel_1.8.0/agent_scan_reduce.cuh | 454 ++++++++ .../block_store_exchange.cuh | 1019 +++++++++++++++++ .../fused_kernel_1.8.0/device_scan_reduce.cuh | 206 ++++ .../dispatch_scan_reduce.cuh | 810 +++++++++++++ 6 files changed, 2491 insertions(+), 2 deletions(-) create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh create mode 100644 src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh diff --git a/src/Makevars b/src/Makevars index cd1a5116..5f91f59a 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,6 +1,7 @@ CUDA_HOME = /usr/local/cuda CUB_PATH = $(CUDA_HOME)/include +#CUB_PATH = /home/jianxiao/cub_1.8.0/cub ## Use the R_HOME indirection to support installations of multiple R version PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` -lOpenCL -pthread -L$(CUDA_HOME)/lib64 -lcuda -lcudart diff --git a/src/cyclops/engine/CudaKernel.cu b/src/cyclops/engine/CudaKernel.cu index 29c36dd6..f072d7e6 100644 --- a/src/cyclops/engine/CudaKernel.cu +++ b/src/cyclops/engine/CudaKernel.cu @@ -11,11 +11,10 @@ #include #include #include -//#include -//#include #include "CudaKernel.h" #include "fused_kernel/device_scan_reduce.cuh" +//#include "fused_kernel_1.8.0/device_scan_reduce.cuh" using namespace cub; diff --git a/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh new file mode 100644 index 00000000..5bea9c1b --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/agent_scan_reduce.cuh @@ -0,0 +1,454 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScanReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan and transform-reduction.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include "block_store_exchange.cuh"
+#include
+#include
+#include
+#include
+#include
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScanReduce
+ */
+template <
+    int _BLOCK_THREADS,                    ///< Threads per thread block
+    int _ITEMS_PER_THREAD,                 ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm _LOAD_ALGORITHM,    ///< The BlockLoad algorithm to use
+    CacheLoadModifier _LOAD_MODIFIER,      ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm _STORE_ALGORITHM,  ///< The BlockStore algorithm to use
+    BlockScanAlgorithm _SCAN_ALGORITHM>    ///< The BlockScan algorithm to use
+    //BlockReduceAlgorithm _REDUCE_ALGORITHM>  ///< Cooperative block-wide reduction algorithm to use
+struct AgentScanPolicy1
+{
+    enum
+    {
+        BLOCK_THREADS    = _BLOCK_THREADS,     ///< Threads per thread block
+        ITEMS_PER_THREAD = _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
+//        VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;     ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;        ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;  ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;     ///< The BlockScan algorithm to use
+    //static const BlockReduceAlgorithm REDUCE_ALGORITHM = _REDUCE_ALGORITHM;  ///< Cooperative block-wide reduction algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScanReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan and transform-reduction.
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename ScanInputIteratorT, ///< Random-access input iterator type + typename TransformInputIteratorT, ///< Random-access additional input iterator type + typename OutputIteratorT, ///< Random-access reduction output iterator type + typename ScanOpT, ///< Scan functor type + typename ReductionOpT, ///< Reduction functor type + typename TransformOpT, ///< Transformation functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScanReduce +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + typedef typename std::iterator_traits::value_type InputT1; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + ScanInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedScanInputIteratorT; + + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + ScanInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedTransformInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + typedef BlockLoad< + InputT1, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT1; + + // Parameterized BlockStore type + typedef BlockStore< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + typedef BlockStore< + InputT1, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT1; + + // Parameterized BlockReduce type + typedef BlockReduce< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // Parameterized BlockScan type + typedef BlockScan< + InputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + InputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + 
InputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + typename BlockLoadT1::TempStorage load1; // Smem needed for tile loading + typename BlockStoreT1::TempStorage store1; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename BlockReduceT::TempStorage reduce; + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Linear thread-id + int linear_tid; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedScanInputIteratorT d_in; ///< Input data + WrappedTransformInputIteratorT d_trans_in; ///< Additional input data for transformation + OutputIteratorT d_block_sum; ///< Output data (block reduction) + OutputIteratorT d_sum; ///< Output data (reduction) + ScanOpT scan_op; ///< Binary scan operator + ReductionOpT reduction_op; ///< Binary reduction operator + TransformOpT transform_op; ///< Transformation operator on scan output + InitValueT init_value; ///< The init_value element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + InputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + InputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + InputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScanReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + ScanInputIteratorT d_in, ///< Input data + TransformInputIteratorT d_trans_in, ///< Additional input data 
for transformation + OutputIteratorT d_block_sum, ///< Output data (block reduction) + OutputIteratorT d_sum, ///< Output data (reduction) + ScanOpT scan_op, ///< Binary scan operator + ReductionOpT reduction_op, ///< Binary reduction operator + TransformOpT transform_op, ///< Transformation operator on scan output + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_trans_in(d_trans_in), + d_block_sum(d_block_sum), + d_sum(d_sum), + scan_op(scan_op), + reduction_op(reduction_op), + transform_op(transform_op), + init_value(init_value), + linear_tid(RowMajorTid(AgentScanPolicyT::BLOCK_THREADS, 1, 1)) + {} + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + InputT items[ITEMS_PER_THREAD]; // input for scan + InputT1 items1[ITEMS_PER_THREAD]; // input for reduction + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + InputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Perform reduce and store + OutputT item; + OutputT block_aggregate = OutputT(); + OutputT thread_aggregate = OutputT(); + + if (IS_LAST_TILE) { // partially-full tile + + BlockStoreT(temp_storage.store).Exchange(tile_offset, items, num_remaining); + + // load additional data + BlockLoadT1(temp_storage.load1).Load(d_trans_in + tile_offset, items1, num_remaining); + BlockStoreT1(temp_storage.store1).Exchange(tile_offset, items1, num_remaining); + + CTA_SYNC(); + + // thread transform reduction + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < num_remaining) + { + item = transform_op(items[ITEM], items1[ITEM]); + thread_aggregate = reduction_op(thread_aggregate, item); + } + } + + } else { // full tile + + // load additional data + BlockLoadT1(temp_storage.load1).Load(d_trans_in + tile_offset, items1); + + CTA_SYNC(); + + // thread transform reduction + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + item = transform_op(items[ITEM], items1[ITEM]); + thread_aggregate = reduction_op(thread_aggregate, item); + } + + } + + // block reduction + block_aggregate = BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + + if (threadIdx.x == 0) + d_block_sum[blockIdx.x] = block_aggregate; + + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< 
Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh b/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh new file mode 100644 index 00000000..59e6f5f9 --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/block_store_exchange.cuh @@ -0,0 +1,1019 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. 
+ * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
+ MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + + /// Exchange items for partially-full tile (last tile), guarded by range + template + __device__ __forceinline__ void Exchange( + 
OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to exchange + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for 
storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. 
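+     * (Note that, per the CUB_STATIC_ASSERT in the \p BLOCK_STORE_WARP_TRANSPOSE helper
+     * above, this specialization requires \p BLOCK_THREADS to be a multiple of the
+     * architecture's warp size.)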
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+     * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5.
+     * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with
+     * only the first two threads being unmasked to store portions of valid data.
+ * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } + + template + __device__ __forceinline__ void Exchange( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to exchange + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Exchange(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh new file mode 100644 index 00000000..4fae1c10 --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/device_scan_reduce.cuh @@ -0,0 +1,206 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScanReduce provides device-wide, parallel operations for computing a prefix scan and a tranform-reduction across a sequence of data items residing within device-accessible memory. 
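+ *
+ * (Sketch of the approach, per the dispatch logic in dispatch_scan_reduce.cuh: the
+ * prefix scan is computed with CUB's single-pass decoupled look-back strategy, each
+ * scanned element is immediately combined with the corresponding element of a second
+ * input sequence via the transform functor, and the transformed values are reduced
+ * to a single aggregate, all in one sweep over the data plus a small fix-up launch.)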
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include "dispatch_scan_reduce.cuh"
+#include 
+
+/// Optional outer namespace(s)
+//CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+struct DeviceFuse
+{
+    /******************************************************************//**
+     * \name Inclusive scan and transform-reduction
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified
+     * binary \p scan_op functor, \p reduction_op functor and \p transform_op functor.
+     *
+     * @par Snippet
+     * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int`
+     * device vector.
+     *
+     * @par
+     * @code
+     * #include <cub/cub.cuh>
+     * #include "device_scan_reduce.cuh"
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for
+     * // input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [1, 2, 2, 1, 3, 0, 1]
+     * int  *d_trans_in;    // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_sum;         // e.g., [-]
+     * CustomMin min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for the fused
+     * // scan and transform-reduction
+     * void *d_temp_storage = nullptr;
+     * size_t temp_storage_bytes = 0;
+     * DeviceFuse::ScanReduce(
+     *   d_temp_storage, temp_storage_bytes,
+     *   d_in, d_trans_in, d_sum,
+     *   cub::Sum(), min_op, cub::Sum(),
+     *   num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run the fused scan and transform-reduction
+     * DeviceFuse::ScanReduce(
+     *   d_temp_storage, temp_storage_bytes,
+     *   d_in, d_trans_in, d_sum,
+     *   cub::Sum(), min_op, cub::Sum(),
+     *   num_items);
+     *
+     * // d_sum <-- [26]
+     * @endcode
+     *
+     * @tparam ScanInputIteratorT
+     *   **[inferred]** Random-access input iterator type for reading scan
+     *   inputs \iterator
+     *
+     * @tparam TransformInputIteratorT
+     *   **[inferred]** Random-access input iterator type for reading additional
+     *   input for transformation \iterator
+     *
+     * @tparam OutputIteratorT
+     *   **[inferred]** Random-access output iterator type for writing reduction
+     *   outputs \iterator
+     *
+     * @tparam ScanOpT
+     *   **[inferred]** Binary scan functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @tparam ReductionOpT
+     *   **[inferred]** Binary reduction functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @tparam TransformOpT
+     *   **[inferred]** Binary transform functor type having member
+     *   `T operator()(const T &a, const T &b)`
+     *
+     * @param[in] d_temp_storage
+     *   Device-accessible allocation of temporary storage. When `nullptr`, the
+     *   required allocation size is written to `temp_storage_bytes` and no
+     *   work is done.
+     *
+     * @param[in,out] temp_storage_bytes
+     *   Reference to size in bytes of `d_temp_storage` allocation
+     *
+     * @param[in] d_in
+     *   Random-access iterator to the input sequence of data items
+     *
+     * @param[in] d_trans_in
+     *   Random-access iterator to the additional input sequence of data items
+     *
+     * @param[out] d_sum
+     *   Random-access iterator to the output aggregate
+     *
+     * @param[in] num_items
+     *   Total number of input items (i.e., the length of `d_in`)
+     *
+     * @param[in] stream
+     *   **[optional]** CUDA stream to launch kernels within.
+     *   Default is stream0.
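+     *
+     * @param[in] debug_synchronous
+     *   **[optional]** Whether or not to synchronize the stream after every
+     *   kernel launch to check for errors. May cause significant slowdown.
+     *   Default is `false`.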
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + */ + template < + typename ScanInputIteratorT, + typename TransformInputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename ReductionOpT, + typename TransformOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t ScanReduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScanReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_trans_in, + d_sum, + scan_op, + reduction_op, + transform_op, + NullType(), + num_items, + stream, + debug_synchronous); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh new file mode 100644 index 00000000..9805552a --- /dev/null +++ b/src/cyclops/engine/fused_kernel_1.8.0/dispatch_scan_reduce.cuh @@ -0,0 +1,810 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScanReduce provides device-wide, parallel operations for computing a prefix scan and transform-reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "agent_scan_reduce.cuh" +#include +#include +#include +#include +#include +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanReduceInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel1( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename ScanInputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename TransformInputIteratorT, ///< Random-access input iterator type for reading additional inputs for transfromation \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing reduction outputs \iterator + typename ScanTileStateT, ///< Tile status interface type + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename TransformOpT, ///< Transformation functor on scan output + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanReduceKernel( + ScanInputIteratorT d_in, ///< Input data + TransformInputIteratorT d_trans_in, ///< Additional input data for 
transformation + OutputIteratorT d_block_sum, ///< Output data (block reduction) + OutputIteratorT d_sum, ///< Output data (reduction) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + ReductionOpT reduction_op, ///< Binary reduction functor + TransformOpT transform_op, ///< Transformation functor on scan output + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScanReduce< + ScanPolicyT, + ScanInputIteratorT, + TransformInputIteratorT, + OutputIteratorT, + ScanOpT, + ReductionOpT, + TransformOpT, + InitValueT, + OffsetT> AgentScanReduceT; + + // Shared memory for AgentScan + __shared__ typename AgentScanReduceT::TempStorage temp_storage; + + // Process tiles + AgentScanReduceT(temp_storage, d_in, d_trans_in, d_block_sum, d_sum, scan_op, reduction_op, transform_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); + + +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. + */ +template < +// typename ChainedPolicyT, ///< Chained tuning policy + typename SingleTilePolicyT, ///< Chained tuning policy + typename ScanInputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +//__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__launch_bounds__ (int(SingleTilePolicyT::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel1( + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< +// typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + SingleTilePolicyT, + ScanInputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct 
DeviceScanReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename ScanInputIteratorT, ///< 
Random-access input iterator type for reading scan inputs \iterator + typename TransformInputIteratorT, ///< Random-access input iterator type for reading additional inputs for transformation \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing reduction outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename TransformOpT, ///< Transformation functor on scan output + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScanReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + int num_tiles; + int init_grid_size; + int scan_grid_size; + int scan_sm_occupancy; + ScanTileStateT tile_state; + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + 
LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, InputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + struct PtxSingleTilePolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. 
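+     *
+     * (A sketch of the flow implemented below: \p init_kernel first initializes the
+     * \p ScanTileStateT tile descriptors; \p scan_reduce_kernel then sweeps the input
+     * in grid-sized epochs, writing one partial reduction per thread block to
+     * \p d_block_sum; a final single launch of \p single_tile_kernel folds those
+     * partials into the scalar \p d_sum. Callers follow the usual CUB two-phase
+     * pattern, sketched here with elided arguments:)
+     *
+     * \code
+     * size_t temp_storage_bytes = 0;
+     * Dispatch(NULL, temp_storage_bytes, ...);            // sizing pass: only computes the byte count
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     * Dispatch(d_temp_storage, temp_storage_bytes, ...);  // launch pass: runs the kernels
+     * \endcode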
+ */ + template < +// typename SingleTilePolicyT, + typename ScanReduceInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanReduceSweepKernelPtrT, ///< Function type of cub::DeviceScanKernelPtrT + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanReduceInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanReduceSweepKernelPtrT scan_reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + SingleTileKernelT single_tile_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_trans_in; + (void)d_sum; + (void)scan_op; + (void)reduction_op; + (void)transform_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_reduce_kernel; + (void)single_tile_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + typedef typename DispatchScanReduce::MaxPolicy MaxPolicyT; + typedef typename MaxPolicyT::SingleTilePolicy SingleTilePolicyT; + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + + // Specify temporary storage 
allocation requirements + size_t allocation_sizes[2]; // bytes for both scan and reduction + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors (scan) + allocation_sizes[1] = + { + scan_grid_size * sizeof(OutputT) // bytes needed for privatized block reductions (reduction) + }; + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[2]; // temp storage for both scan and reduction + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + OutputT* d_block_sum = (OutputT*) allocations[1]; + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_reduce_kernel, + scan_kernel_config.block_threads))) break; + + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_reduce_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); + + // Invoke scan_kernel + scan_reduce_kernel<<>>( + d_in, + d_trans_in, + d_block_sum, + d_sum, + tile_state, + start_tile, + scan_op, + reduction_op, + transform_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + SingleTilePolicyT::BLOCK_THREADS, + (long long) stream, + SingleTilePolicyT::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, SingleTilePolicyT::BLOCK_THREADS, 0, stream>>>( + d_block_sum, + d_sum, + scan_grid_size, + reduction_op, + OutputT()); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* 
d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ScanInputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + TransformInputIteratorT d_trans_in, ///< [in] Pointer to the additional input sequence of data items for transformation + OutputIteratorT d_sum, ///< [out] Pointer to the output aggregate + ScanOpT scan_op, ///< [in] Binary scan functor + ReductionOpT reduction_op, ///< [in] Binary reduction functor + TransformOpT transform_op, ///< [in] Transformation functor on scan output + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchScanReduce::MaxPolicy MaxPolicyT; + typedef typename MaxPolicyT::SingleTilePolicy SingleTilePolicyT; + + cudaError error = cudaSuccess; +// debug_synchronous = true; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_trans_in, + d_sum, + scan_op, + reduction_op, + transform_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanReduceInitKernel, + DeviceScanReduceKernel, + DeviceReduceSingleTileKernel1, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + From 1cf448118cf64d100bb08ba14fedcdfe229cad8e Mon Sep 17 00:00:00 2001 From: Jianxiao Yang Date: Sat, 10 Dec 2022 19:55:06 +0800 Subject: [PATCH 04/18] add ScanReduce() example for cub_1.17 --- .../fused_kernel_1.17/device_scan_reduce.cuh | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh b/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh index 41d6a39c..4b8c1e9c 100644 --- a/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh +++ b/src/cyclops/engine/fused_kernel_1.17/device_scan_reduce.cuh @@ -52,6 +52,107 @@ struct DeviceFuse /** * \brief Computes a device-wide inclusive prefix scan and transform-reduction using the specified binary \p scan_op functor, \p reduction_op functor and \p transform_op functor. + * + * @par Snippet + * The code snippet below illustrates the inclusive prefix scan and transform-reduction of an `int` + * device vector. + * + * @par + * @code + * #include + * #include "device_scan_reduce.cuh" + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? 
b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int  num_items;      // e.g., 7
+ * int  *d_in;          // e.g., [1, 2, 2, 1, 3, 0, 1]
+ * int  *d_trans_in;    // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int  *d_sum;         // e.g., [-]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for the fused
+ * // scan and transform-reduction
+ * void *d_temp_storage = nullptr;
+ * size_t temp_storage_bytes = 0;
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), min_op, cub::Sum(),
+ *   num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run the fused scan and transform-reduction
+ * DeviceFuse::ScanReduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_trans_in, d_sum,
+ *   cub::Sum(), min_op, cub::Sum(),
+ *   num_items);
+ *
+ * // d_sum <-- [26]
+ * @endcode
+ *
+ * @tparam ScanInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading scan
+ *   inputs \iterator
+ *
+ * @tparam TransformInputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading additional
+ *   input for transformation \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Random-access output iterator type for writing reduction
+ *   outputs \iterator
+ *
+ * @tparam ScanOpT
+ *   **[inferred]** Binary scan functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam ReductionOpT
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam TransformOpT
+ *   **[inferred]** Binary transform functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Random-access iterator to the input sequence of data items
+ *
+ * @param[in] d_trans_in
+ *   Random-access iterator to the additional input sequence of data items
+ *
+ * @param[out] d_sum
+ *   Random-access iterator to the output aggregate
+ *
+ * @param[in] num_items
+ *   Total number of input items (i.e., the length of `d_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ * + * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template < typename ScanInputIteratorT, From 270043434fa217f834a0d03bb037aa224011f045 Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Tue, 4 Apr 2023 14:59:40 -0700 Subject: [PATCH 05/18] attempt at 3.3.0 CRAN submission --- DESCRIPTION | 2 +- NEWS.md | 11 +++++++++++ R/Survfit.R | 6 ++++-- README.md | 4 ++-- cran-comments.md | 18 ++++++++++-------- inst/CITATION | 10 +++++----- man/survfit.cyclopsFit.Rd | 6 ++++-- src/cyclops/Types.h | 18 +++++++++--------- ...ierarchyGridSearchCrossValidationDriver.cpp | 6 +++--- 9 files changed, 49 insertions(+), 32 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f9555bf2..4b37e0fc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: Cyclops Type: Package Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis -Version: 3.2.0 +Version: 3.3.0 Authors@R: c( person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")), person("Martijn J.", "Schuemie", role = "aut"), diff --git a/NEWS.md b/NEWS.md index ee655c35..17c42c9b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,15 @@ +Cyclops v3.3.0 +============== + +Changes: + +1. bump for R 4.2 +2. fix CRAN warnings + a. used `minValues` + b. explicit dependence on C++11 + Cyclops v3.2.0 +============== Changes: diff --git a/R/Survfit.R b/R/Survfit.R index f1f2532f..798935fb 100644 --- a/R/Survfit.R +++ b/R/Survfit.R @@ -4,15 +4,17 @@ #' @description #' \code{survfit.cyclopsFit} computes baseline hazard function #' -#' @param cyclopsFit A Cyclops survival model fit object +#' @param formula A Cyclops survival model fit object #' @param type type of baseline survival, choices are: "aalen" (Breslow) +#' @param ... for future methods #' #' @return Baseline survival function for mean covariates #' #' @importFrom survival survfit #' #' @export -survfit.cyclopsFit <- function(cyclopsFit, type="aalen") { +survfit.cyclopsFit <- function(formula, type="aalen", ...) { + cyclopsFit <- formula delta = meanLinearPredictor(cyclopsFit) times = getTimeVector(cyclopsFit$cyclopsData) diff --git a/README.md b/README.md index 3cf4cb0c..f7eded0b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Cyclops ======= [![Build Status](https://github.com/ohdsi/Cyclops/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/Cyclops/actions?query=workflow%3AR-CMD-check) -[![codecov.io](https://codecov.io/github/OHDSI/Cyclops/coverage.svg?branch=main)](https://codecov.io/github/OHDSI/Cyclops?branch=main) +[![codecov.io](https://app.codecov.io/github/OHDSI/Cyclops/coverage.svg?branch=main)](https://app.codecov.io/github/OHDSI/Cyclops?branch=main) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/Cyclops)](https://CRAN.R-project.org/package=Cyclops) [![CRAN_Status_Badge](https://cranlogs.r-pkg.org/badges/Cyclops)](https://cran.r-project.org/package=Cyclops) @@ -76,7 +76,7 @@ License ======= Cyclops is licensed under Apache License 2.0. Cyclops contains the TinyThread libray. -The TinyThread library is licensed under the [zlib/libpng](https://opensource.org/licenses/Zlib/) license as described [here](https://tinythreadpp.bitsnbites.eu/). +The TinyThread library is licensed under the [zlib/libpng](https://opensource.org/license/zlib-license-php/) license as described [here](https://tinythreadpp.bitsnbites.eu/). 
Development
===========
diff --git a/cran-comments.md b/cran-comments.md
index 953a3c98..d76f48cb 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,9 +1,6 @@
-## Initial submission of major patch update to package
+## Initial submission of patch update to package
 
-* fixed likelihood profiling when objective function is non-concave due
-  to floating-point truncation
-* fixed parsing of 64-bit integer covariate IDs
-* added Jeffrey's prior for single regression coefficient
+* fixed all CRAN WARNINGS, including generic-inconsistency, uninitialized values
 
 ## Test environments
 * local OS X install, R 4.1
@@ -12,16 +9,21 @@
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 1 occasional NOTE:
-  checking installed package size ... NOTE
+* There are 2 occasional NOTEs:
+  1. checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
       libs  21.7Mb
+  2. Specified C++11: please drop specification unless essential
 
-This occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
+The first NOTE occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
 symbols in their compilation; Cyclops performance is heavily dependent on many template
 instantiations that generate a large library when including debug symbols. Future
 availability of C++17 'if constexpr (...)' should decrease library size substantially.
 
+The second NOTE remains because many OHDSI community users remain on R 4.0, which supports
+C++11 but does not automatically compile with C++11 enabled.
+
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
+* 'IterativeHardThresholding' - checked and works.
diff --git a/inst/CITATION b/inst/CITATION
index 8d29f2ff..7306e6e7 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -1,11 +1,11 @@
 citHeader("To cite Cyclops in publications use:")
 
 bibentry(bibtype = "Article",
-         author = personList(as.person("M. A. Suchard"),
-                             as.person("S. E. Simpson"),
-                             as.person("I. Zorych"),
-                             as.person("P. Ryan"),
-                             as.person("D. Madigan")),
+         author = c(as.person("M. A. Suchard"),
+                    as.person("S. E. Simpson"),
+                    as.person("I. Zorych"),
+                    as.person("P. Ryan"),
+                    as.person("D. Madigan")),
          title = "Massive parallelization of serial inference algorithms for complex generalized linear models",
          journal = "ACM Transactions on Modeling and Computer Simulation",
          volume = "23",
diff --git a/man/survfit.cyclopsFit.Rd b/man/survfit.cyclopsFit.Rd
index e77faa42..0b4b4051 100644
--- a/man/survfit.cyclopsFit.Rd
+++ b/man/survfit.cyclopsFit.Rd
@@ -4,12 +4,14 @@
 \alias{survfit.cyclopsFit}
 \title{Calculate baseline hazard function}
 \usage{
-\method{survfit}{cyclopsFit}(cyclopsFit, type = "aalen")
+\method{survfit}{cyclopsFit}(formula, type = "aalen", ...)
} \arguments{ -\item{cyclopsFit}{A Cyclops survival model fit object} +\item{formula}{A Cyclops survival model fit object} \item{type}{type of baseline survival, choices are: "aalen" (Breslow)} + +\item{...}{for future methods} } \value{ Baseline survival function for mean covariates diff --git a/src/cyclops/Types.h b/src/cyclops/Types.h index 32d514ac..b477e85a 100644 --- a/src/cyclops/Types.h +++ b/src/cyclops/Types.h @@ -12,7 +12,7 @@ #include #include -#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +// #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L // C++11 #include namespace bsccs { @@ -20,15 +20,15 @@ using std::make_shared; using std::unique_ptr; } -#else +// #else // C++98 (R build) - #include "boost/smart_ptr.hpp" - namespace bsccs { - using boost::shared_ptr; - using boost::make_shared; - using boost::unique_ptr; - } -#endif +// #include "boost/smart_ptr.hpp" +// namespace bsccs { +// using boost::shared_ptr; +// using boost::make_shared; +// using boost::unique_ptr; +// } +// #endif #ifdef WIN_BUILD #include diff --git a/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp b/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp index 49c3476d..68c3d1a0 100644 --- a/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp +++ b/src/cyclops/drivers/HierarchyGridSearchCrossValidationDriver.cpp @@ -74,7 +74,7 @@ void HierarchyGridSearchCrossValidationDriver::drive(CyclicCoordinateDescent& cc std::vector outerPoints; std::vector innerPoints; std::vector outerValues; - std::vector minValues; + // std::vector minValues; for (int outerStep = 0; outerStep < gridSize; outerStep++){ std::vector predLogLikelihoodOuter; @@ -124,9 +124,9 @@ void HierarchyGridSearchCrossValidationDriver::drive(CyclicCoordinateDescent& cc // Report results double maxPoint; double maxValue; - double minValue; + // double minValue; removed for v3.3.0; TODO why was this here? 
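+        // (Note: findMax() only reports the location and value of the maximum, so
+        // minValue was never assigned before the removed push_back() below stored it;
+        // that uninitialized read is the likely trigger for the CRAN warning cited
+        // in cran-comments.md.)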
findMax(&maxPoint, &maxValue); - minValues.push_back(minValue); + // minValues.push_back(minValue); innerPoints.push_back(maxPoint); outerPoints.push_back(outerPoint); From d88c531efa06933c1007cb785343f1382c95ee0d Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Wed, 5 Apr 2023 08:47:41 -0700 Subject: [PATCH 06/18] fix CRAN warnings and notes; remove explicit dependency on C++11 for R < 4.1 --- .Rbuildignore | 2 ++ .gitignore | 2 ++ DESCRIPTION | 1 + NEWS.md | 3 ++- cleanup | 3 +++ configure | 18 ++++++++++++++++++ configure.win | 2 ++ cran-comments.md | 13 +++++-------- src/{Makevars => Makevars.in} | 2 -- src/{Makevars.win => Makevars.win.in} | 2 -- tools/configure.R | 23 +++++++++++++++++++++++ 11 files changed, 58 insertions(+), 13 deletions(-) create mode 100755 cleanup create mode 100755 configure create mode 100755 configure.win rename src/{Makevars => Makevars.in} (99%) rename src/{Makevars.win => Makevars.win.in} (99%) create mode 100644 tools/configure.R diff --git a/.Rbuildignore b/.Rbuildignore index 5695b3ca..6d87dae1 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -34,3 +34,5 @@ deprecated ^\.github$ ^CRAN-RELEASE$ .covrignore +src/Makevars$ +src/Makevars.win$ diff --git a/.gitignore b/.gitignore index 66ec7364..acfe09e8 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ CRAN-RELEASE .DS_Store *.gcno *.gcda +src/Makevars +src/Makevars.win diff --git a/DESCRIPTION b/DESCRIPTION index 4b37e0fc..3f493895 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Description: This model fitting tool incorporates cyclic coordinate descent and Suchard, Simpson, Zorych, Ryan and Madigan (2013) . License: Apache License 2.0 LazyData: Yes +Biarch: true URL: https://github.com/ohdsi/cyclops BugReports: https://github.com/ohdsi/cyclops/issues Depends: diff --git a/NEWS.md b/NEWS.md index 17c42c9b..3cf71839 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,8 @@ Changes: 1. bump for R 4.2 2. fix CRAN warnings a. used `minValues` - b. explicit dependence on C++11 +3. fix CRAN notes + a. remove explicit dependence on C++11 (except for R <= 4.0) Cyclops v3.2.0 ============== diff --git a/cleanup b/cleanup new file mode 100755 index 00000000..db13273f --- /dev/null +++ b/cleanup @@ -0,0 +1,3 @@ +#!/bin/sh + +rm -f config.* src/Makevars src/Markvars.win src/config.h diff --git a/configure b/configure new file mode 100755 index 00000000..87fb3528 --- /dev/null +++ b/configure @@ -0,0 +1,18 @@ +#!/usr/bin/env sh + +# adapted from https://github.com/tidyverse/purrr/blob/main/configure + +# Check that this is not just ./configure. We need to run this +# from R CMD INSTALL, to have the R env vars set. + +if [ -z "$R_HOME" ]; then +echo >&2 R_HOME is not set, are you running R CMD INSTALL? +exit 1 +fi + +# Find the R binary we need to use. This is a bit trickier on +# Windows, because it has two architectures. On windows R_ARCH_BIN +# is set, so this should work everywhere. +RBIN="${R_HOME}/bin${R_ARCH_BIN}/R" + +"$RBIN" --vanilla --slave -f tools/configure.R diff --git a/configure.win b/configure.win new file mode 100755 index 00000000..1f66e1c7 --- /dev/null +++ b/configure.win @@ -0,0 +1,2 @@ +#! 
+sh ./configure
diff --git a/cran-comments.md b/cran-comments.md
index d76f48cb..c4255b42 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,6 +1,7 @@
 ## Initial submission of patch update to package
 
-* fixed all CRAN WARNINGS, including generic-inconsistency, uninitialized values
+* fixed all WARNINGS, including generic-inconsistency, uninitialized values
+* fixed NOTE about C++11
 
 ## Test environments
 * local OS X install, R 4.1
@@ -9,21 +10,17 @@
 
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 2 occasional NOTEs:
-  1. checking installed package size ... NOTE
+* There is 1 occasional NOTE:
     checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
       libs  21.7Mb
-  2. Specified C++11: please drop specification unless essential
 
-The first NOTE occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
+This occurs on systems (like 'r-devel-linux-x86_64-fedora-clang') that include debug
 symbols in their compilation; Cyclops performance is heavily dependent on many template
 instantiations that generate a large library when including debug symbols.  Future
 availability of C++17 'if constexpr (...)' should decrease library size substantially.
 
-The second NOTE remains because many OHDSI community users remain on R 4.0 that supports C++11
-but does not automatically compile with C++11 enabled.
-
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
 * 'IterativeHardThresholding' - checked and works.
diff --git a/src/Makevars b/src/Makevars.in
similarity index 99%
rename from src/Makevars
rename to src/Makevars.in
index a1bfd86d..ee8a04ba 100644
--- a/src/Makevars
+++ b/src/Makevars.in
@@ -24,8 +24,6 @@ PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
 ## done by a number of packages, but recommended only for more advanced users
 ## comfortable with autoconf and its related tools.
 
-CXX_STD = CXX11
-
 PKG_CPPFLAGS = -I. -Icyclops -DR_BUILD -DDOUBLE_PRECISION
 
 PKG_CXXFLAGS = -g1
diff --git a/src/Makevars.win b/src/Makevars.win.in
similarity index 99%
rename from src/Makevars.win
rename to src/Makevars.win.in
index dd5cd45c..8a09c711 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win.in
@@ -24,8 +24,6 @@ PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
 ## done by a number of packages, but recommended only for more advanced users
 ## comfortable with autoconf and its related tools.
 
-CXX_STD = CXX11
-
 PKG_CPPFLAGS = -I.
-Icyclops -DR_BUILD -DWIN_BUILD -DDOUBLE_PRECISION -DRCPP_PARALLEL_USE_TBB=1 # PKG_LIBS += $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" \ diff --git a/tools/configure.R b/tools/configure.R new file mode 100644 index 00000000..d9853786 --- /dev/null +++ b/tools/configure.R @@ -0,0 +1,23 @@ +# Gets run when compiling + +makevars_in <- file.path("src", "Makevars.in") +makevars_win_in <- file.path("src", "Makevars.win.in") + +makevars_out <- file.path("src", "Makevars") +makevars_win_out <- file.path("src", "Makevars.win") + +txt <- readLines(makevars_in) +txt_win <- readLines(makevars_win_in) + +if (getRversion() < "4.1") { + if (!any(grepl("^CXX_STD", txt))) { + txt <- c("CXX_STD = CXX11", txt) + } + + if (!any(grepl("^CXX_STD", txt_win))) { + txt_win <- c("CXX_STD = CXX11", txt_win) + } +} + +cat(txt, file = makevars_out, sep = "\n") +cat(txt_win, file = makevars_win_out, sep = "\n") From ec41b492efbdaca83a923ffbe8c897a703435493 Mon Sep 17 00:00:00 2001 From: Marc Suchard Date: Wed, 5 Apr 2023 10:57:32 -0700 Subject: [PATCH 07/18] fix line-endings for windoz, remove some std::cerr --- DESCRIPTION | 2 +- NEWS.md | 1 + src/cyclops/drivers/AbstractCrossValidationDriver.cpp | 2 -- src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp | 5 +---- tools/configure.R | 7 +++++-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 747d0b51..0158d5b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: Cyclops Type: Package Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis -Version: 3.3.0 +Version: 3.3.0.99 Authors@R: c( person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")), person("Martijn J.", "Schuemie", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 726fbbc8..37d71a8a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ develop Changes: 1. improvements on adaptive likelihood profiling +2. 
fix line-endings on Makevars on windows
 
 Cyclops v3.3.0
 ==============
diff --git a/src/cyclops/drivers/AbstractCrossValidationDriver.cpp b/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
index 2d3ac6f0..4ceddf75 100644
--- a/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
+++ b/src/cyclops/drivers/AbstractCrossValidationDriver.cpp
@@ -141,8 +141,6 @@ double AbstractCrossValidationDriver::doCrossValidationStep(
 	const auto& arguments = allArguments.crossValidation;
 	bool coldStart = allArguments.resetCoefficients;
 
-	std::cerr << "C " << arguments.foldToCompute << std::endl;
-
 	predLogLikelihood.resize(arguments.foldToCompute);
 
 	auto& weightsExclude = this->weightsExclude;
diff --git a/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp b/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
index b4b67080..9b1da1ee 100644
--- a/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
+++ b/src/cyclops/drivers/AutoSearchCrossValidationDriver.cpp
@@ -112,13 +112,10 @@ MaxPoint AutoSearchCrossValidationDriver::doCrossValidationLoop(
 
 		std::vector<double> predLogLikelihood;
 
-		std::cerr << "A" << std::endl;
-
 		// Newly re-located code
 		double pointEstimate = doCrossValidationStep(ccd, selector, allArguments, step,
 			nThreads, ccdPool, selectorPool,
-			predLogLikelihood);
-		std::cerr << "B" << std::endl;
+			predLogLikelihood);
 
 		double stdDevEstimate = computeStDev(predLogLikelihood, pointEstimate);
 
diff --git a/tools/configure.R b/tools/configure.R
index d9853786..b372da41 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -19,5 +19,8 @@ if (getRversion() < "4.1") {
   }
 }
 
-cat(txt, file = makevars_out, sep = "\n")
-cat(txt_win, file = makevars_win_out, sep = "\n")
+if (.Platform$OS.type == "unix") {
+  cat(txt, file = makevars_out, sep = "\n")
+} else {
+  cat(txt_win, file = makevars_win_out, sep = "\n")
+}

From 353d6bda4800b79091f42aebb43636063454acca Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:35:13 +0000
Subject: [PATCH 08/18] log both raw estimates and summary stats for bootstrap

---
 src/cyclops/drivers/BootstrapDriver.cpp | 54 +++++++++++++++++++++++++
 src/cyclops/drivers/BootstrapDriver.h   |  2 +
 2 files changed, 56 insertions(+)

diff --git a/src/cyclops/drivers/BootstrapDriver.cpp b/src/cyclops/drivers/BootstrapDriver.cpp
index e07613f4..049cefaa 100644
--- a/src/cyclops/drivers/BootstrapDriver.cpp
+++ b/src/cyclops/drivers/BootstrapDriver.cpp
@@ -131,4 +131,58 @@ void BootstrapDriver::logResults(const CCDArguments& arguments, std::vector
 }
 
+void BootstrapDriver::logHR(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string treatmentId) {
+
+// int j = J-1;
+	int j = 0;
+	while (modelData->getColumnLabel(j) != treatmentId) j++;
+
+	ofstream outLog(arguments.outFileName.c_str());
+	if (!outLog) {
+		std::ostringstream stream;
+		stream << "Unable to open log file: " << arguments.outFileName;
+		error->throwError(stream);
+	}
+
+	string sep(","); // TODO Make option
+
+	// Raw estimates
+	for(rvector::iterator it = estimates[j]->begin(); it != estimates[j]->end(); ++it) outLog << *it << endl;
+
+	// Stats
+	outLog << "Drug_concept_id" << sep << "Condition_concept_id" << sep <<
+			"score" << sep << "standard_error" << sep << "bs_mean" << sep << "bs_lower" << sep <<
+			"bs_upper" << sep << "bs_prob0" << endl;
+
+	outLog << modelData->getColumnLabel(j) <<
+			sep << treatmentId << sep;
+
+	double mean = 0.0;
+	double var = 0.0;
+	double prob0 = 0.0;
+	for (rvector::iterator it = estimates[j]->begin(); it != estimates[j]->end(); ++it) {
+		mean += *it;
+		var += *it * *it;
+		if (*it == 0.0) {
+			prob0 += 1.0;
+		}
+	}
+
+	double size = static_cast<double>(estimates[j]->size());
+	mean /= size;
+	var = (var / size) - (mean * mean);
+	prob0 /= size;
+
+	sort(estimates[j]->begin(), estimates[j]->end());
+	int offsetLower = static_cast<int>(size * 0.025);
+	int offsetUpper = static_cast<int>(size * 0.975);
+
+	double lower = *(estimates[j]->begin() + offsetLower);
+	double upper = *(estimates[j]->begin() + offsetUpper);
+
+	outLog << savedBeta[j] << sep;
+	outLog << std::sqrt(var) << sep << mean << sep << lower << sep << upper << sep << prob0 << endl;
+	outLog.close();
+}
+
 } // namespace
diff --git a/src/cyclops/drivers/BootstrapDriver.h b/src/cyclops/drivers/BootstrapDriver.h
index e72571ee..64fba9e7 100644
--- a/src/cyclops/drivers/BootstrapDriver.h
+++ b/src/cyclops/drivers/BootstrapDriver.h
@@ -39,6 +39,8 @@ class BootstrapDriver : public AbstractDriver {
 	void logResults(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string conditionId);
 
+	void logHR(const CCDArguments& arguments, std::vector<double>& savedBeta, std::string treatmentId);
+
 private:
 	const int replicates;
 	AbstractModelData* modelData;
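The file that logHR writes starts with one raw bootstrap estimate per line, followed by a header
and a single summary row. That summary can be recomputed in R from the raw estimates; a minimal
sketch, with `est` standing in for the raw estimates (illustrative values only):

    est <- rnorm(100, mean = 0.3, sd = 0.1)   # stand-in for the raw bootstrap estimates
    n <- length(est)
    bs_mean <- mean(est)
    bs_se <- sqrt(mean(est^2) - bs_mean^2)    # population-style variance, matching the C++ loop
    sorted <- sort(est)
    bs_lower <- sorted[floor(n * 0.025) + 1]  # the 0-based offsetLower becomes 1-based in R
    bs_upper <- sorted[floor(n * 0.975) + 1]
    bs_prob0 <- mean(est == 0)                # bootstrap mass exactly at zero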
From 30e44e4450ee30fae06e4ad405acf15d4eb83916 Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:40:25 +0000
Subject: [PATCH 09/18] use the same permutation function for
 SelectorType::BY_ROW and SelectorType::BY_PID for now

---
 src/cyclops/drivers/BootstrapSelector.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/cyclops/drivers/BootstrapSelector.cpp b/src/cyclops/drivers/BootstrapSelector.cpp
index 734589c8..b696bdd4 100644
--- a/src/cyclops/drivers/BootstrapSelector.cpp
+++ b/src/cyclops/drivers/BootstrapSelector.cpp
@@ -56,18 +56,20 @@ void BootstrapSelector::permute() {
 
 	// Get non-excluded indices
 	int N_new = indicesIncluded.size();
-	if (type == SelectorType::BY_PID) {
+//	if (type == SelectorType::BY_PID) {
 		std::uniform_int_distribution<int> uniform(0, N_new - 1);
 		for (int i = 0; i < N_new; i++) {
 			int ind = uniform(prng);
 			int draw = indicesIncluded[ind];
 			selectedSet.insert(draw);
 		}
+/*
 	} else {
 		std::ostringstream stream;
 		stream << "BootstrapSelector::permute is not yet implemented.";
 		error->throwError(stream);
 	}
+*/
 }
 
 void BootstrapSelector::getWeights(int batch, std::vector<double>& weights) {
@@ -80,16 +82,18 @@ void BootstrapSelector::getWeights(int batch, std::vector<double>& weights) {
 		return;
 	}
 
-	if (type == SelectorType::BY_PID) {
+	//if (type == SelectorType::BY_PID) {
 		for (size_t k = 0; k < K; k++) {
 			int count = selectedSet.count(ids.at(k));
 			weights[k] = static_cast<double>(count);
 		}
+/*
 	} else {
 		std::ostringstream stream;
 		stream << "BootstrapSelector::getWeights is not yet implemented.";
 		error->throwError(stream);
 	}
+*/
 }
 
 void BootstrapSelector::getComplement(std::vector<double>& weights) {

From 8f161fe2b5c17456f9f576b54adad9c445247b4e Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Tue, 11 Apr 2023 22:43:10 +0000
Subject: [PATCH 10/18] R wrapper for bootstrap; fix (?)
bootstrap for cox --- NAMESPACE | 1 + R/ModelFit.R | 7 +++++++ R/RcppExports.R | 4 ++++ src/RcppCyclopsInterface.cpp | 25 +++++++++++++++++++++++++ src/RcppCyclopsInterface.h | 4 ++-- src/RcppExports.cpp | 14 ++++++++++++++ src/cyclops/CcdInterface.cpp | 13 ++++++++++--- src/cyclops/CcdInterface.h | 3 ++- 8 files changed, 65 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3045a7c4..648ebbf1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(createNonSeparablePrior) export(createParameterizedPrior) export(createPrior) export(createWeightBasedSearchControl) +export(doBootstrap) export(finalizeSqlCyclopsData) export(fitCyclopsModel) export(fitCyclopsSimulation) diff --git a/R/ModelFit.R b/R/ModelFit.R index f74e48f2..c183b3c9 100644 --- a/R/ModelFit.R +++ b/R/ModelFit.R @@ -784,6 +784,13 @@ getSEs <- function(object, covariates) { ses } +#' @export +doBootstrap <- function(object, outFileName, treatmentId) { + .checkInterface(object$cyclopsData, testOnly = TRUE) + bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId) + bs +} + #' @title Confidence intervals for Cyclops model parameters #' #' @description diff --git a/R/RcppExports.R b/R/RcppExports.R index 25c2f8ec..3a5bac2a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -93,6 +93,10 @@ .Call(`_Cyclops_cyclopsFitModel`, inRcppCcdInterface) } +.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId) { + .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId) +} + .cyclopsLogModel <- function(inRcppCcdInterface) { .Call(`_Cyclops_cyclopsLogModel`, inRcppCcdInterface) } diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp index c58a3706..3085ecbc 100644 --- a/src/RcppCyclopsInterface.cpp +++ b/src/RcppCyclopsInterface.cpp @@ -514,6 +514,31 @@ List cyclopsFitModel(SEXP inRcppCcdInterface) { return list; } +// [[Rcpp::export(".cyclopsRunBootstrap")]] +List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId) { + using namespace bsccs; + + XPtr interface(inRcppCcdInterface); + interface->getArguments().doBootstrap = true; + interface->getArguments().outFileName = outFileName; + + // Save parameter point-estimates + std::vector savedBeta; + for (int j = 0; j < interface->getCcd().getBetaSize(); ++j) { + savedBeta.push_back(interface->getCcd().getBeta(j)); + } // TODO Handle above work in interface.runBootstrap + double timeUpdate = interface->runBoostrap(savedBeta, treatmentId); + + interface->diagnoseModel(0.0, 0.0); + + List list = List::create( + Rcpp::Named("interface")=interface, + Rcpp::Named("timeFit")=timeUpdate + ); + RcppCcdInterface::appendRList(list, interface->getResult()); + return list; +} + // [[Rcpp::export(".cyclopsLogModel")]] List cyclopsLogModel(SEXP inRcppCcdInterface) { using namespace bsccs; diff --git a/src/RcppCyclopsInterface.h b/src/RcppCyclopsInterface.h index 8225a2d4..202616f5 100644 --- a/src/RcppCyclopsInterface.h +++ b/src/RcppCyclopsInterface.h @@ -63,8 +63,8 @@ class RcppCcdInterface : public CcdInterface { return CcdInterface::runCrossValidation(ccd, modelData); } - double runBoostrap(std::vector& savedBeta) { - return CcdInterface::runBoostrap(ccd, modelData, savedBeta); + double runBoostrap(std::vector& savedBeta, std::string& treatmentId) { + return CcdInterface::runBoostrap(ccd, modelData, savedBeta, treatmentId); } void logResultsToFile(const std::string& fileName, bool withASE); diff --git 
a/src/RcppExports.cpp b/src/RcppExports.cpp index 84834200..06d6d34f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -305,6 +305,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cyclopsRunBootstrap +List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId); +RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP); + Rcpp::traits::input_parameter< const std::string& >::type outFileName(outFileNameSEXP); + Rcpp::traits::input_parameter< std::string& >::type treatmentId(treatmentIdSEXP); + rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId)); + return rcpp_result_gen; +END_RCPP +} // cyclopsLogModel List cyclopsLogModel(SEXP inRcppCcdInterface); RcppExport SEXP _Cyclops_cyclopsLogModel(SEXP inRcppCcdInterfaceSEXP) { @@ -805,6 +818,7 @@ static const R_CallMethodDef CallEntries[] = { {"_Cyclops_cyclopsSetControl", (DL_FUNC) &_Cyclops_cyclopsSetControl, 23}, {"_Cyclops_cyclopsRunCrossValidationl", (DL_FUNC) &_Cyclops_cyclopsRunCrossValidationl, 1}, {"_Cyclops_cyclopsFitModel", (DL_FUNC) &_Cyclops_cyclopsFitModel, 1}, + {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 3}, {"_Cyclops_cyclopsLogModel", (DL_FUNC) &_Cyclops_cyclopsLogModel, 1}, {"_Cyclops_cyclopsInitializeModel", (DL_FUNC) &_Cyclops_cyclopsInitializeModel, 4}, {"_Cyclops_listOpenCLDevices", (DL_FUNC) &_Cyclops_listOpenCLDevices, 0}, diff --git a/src/cyclops/CcdInterface.cpp b/src/cyclops/CcdInterface.cpp index 29a60ddd..cc1e8c65 100644 --- a/src/cyclops/CcdInterface.cpp +++ b/src/cyclops/CcdInterface.cpp @@ -697,21 +697,28 @@ SelectorType CcdInterface::getDefaultSelectorTypeOrOverride(SelectorType selecto double CcdInterface::runBoostrap( CyclicCoordinateDescent *ccd, AbstractModelData *modelData, - std::vector& savedBeta) { + std::vector& savedBeta, + std::string& treatmentId) { struct timeval time1, time2; gettimeofday(&time1, NULL); auto selectorType = getDefaultSelectorTypeOrOverride( arguments.crossValidation.selectorType, modelData->getModelType()); - BootstrapSelector selector(arguments.replicates, modelData->getPidVectorSTL(), + vector ids; + if (selectorType == SelectorType::BY_ROW) { + ids.resize(modelData->getNumberOfRows()); + std::iota(ids.begin(), ids.end(), 0); + } + BootstrapSelector selector(arguments.replicates, selectorType == SelectorType::BY_ROW ? 
ids : modelData->getPidVectorSTL(), selectorType, arguments.seed, logger, error); BootstrapDriver driver(arguments.replicates, modelData, logger, error); driver.drive(*ccd, selector, arguments); gettimeofday(&time2, NULL); - driver.logResults(arguments, savedBeta, ccd->getConditionId()); +// driver.logResults(arguments, savedBeta, ccd->getConditionId()); + driver.logHR(arguments, savedBeta, treatmentId); return calculateSeconds(time1, time2); } diff --git a/src/cyclops/CcdInterface.h b/src/cyclops/CcdInterface.h index 9728d7b5..318dee1c 100644 --- a/src/cyclops/CcdInterface.h +++ b/src/cyclops/CcdInterface.h @@ -197,7 +197,8 @@ class CcdInterface { double runBoostrap( CyclicCoordinateDescent *ccd, AbstractModelData *modelData, - std::vector& savedBeta); + std::vector& savedBeta, + std::string& treatmentId); void setDefaultArguments(); From 12a748c9872608b9a516896638580fc10c96f80a Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Tue, 11 Apr 2023 22:52:30 +0000 Subject: [PATCH 11/18] update R function name --- NAMESPACE | 2 +- R/ModelFit.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 648ebbf1..1e080feb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(createNonSeparablePrior) export(createParameterizedPrior) export(createPrior) export(createWeightBasedSearchControl) -export(doBootstrap) export(finalizeSqlCyclopsData) export(fitCyclopsModel) export(fitCyclopsSimulation) @@ -42,6 +41,7 @@ export(listOpenCLDevices) export(meanLinearPredictor) export(mse) export(readCyclopsData) +export(runBootstrap) export(setOpenCLDevice) export(simulateCyclopsData) import(Matrix) diff --git a/R/ModelFit.R b/R/ModelFit.R index c183b3c9..bb7269c7 100644 --- a/R/ModelFit.R +++ b/R/ModelFit.R @@ -785,7 +785,7 @@ getSEs <- function(object, covariates) { } #' @export -doBootstrap <- function(object, outFileName, treatmentId) { +runBootstrap <- function(object, outFileName, treatmentId) { .checkInterface(object$cyclopsData, testOnly = TRUE) bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId) bs From 92c9b9e17af6a238529a020c2b64b79629594d33 Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Tue, 11 Apr 2023 22:54:25 +0000 Subject: [PATCH 12/18] update automatically generated Rd --- man/createControl.Rd | 4 +++- man/fitCyclopsSimulation.Rd | 3 ++- man/getFineGrayWeights.Rd | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/man/createControl.Rd b/man/createControl.Rd index 6da2461c..4641897b 100644 --- a/man/createControl.Rd +++ b/man/createControl.Rd @@ -25,7 +25,9 @@ createControl( selectorType = "auto", initialBound = 2, maxBoundCount = 5, - algorithm = "ccd" + algorithm = "ccd", + doItAll = TRUE, + syncCV = FALSE ) } \arguments{ diff --git a/man/fitCyclopsSimulation.Rd b/man/fitCyclopsSimulation.Rd index 0d7270c8..5bcb427e 100644 --- a/man/fitCyclopsSimulation.Rd +++ b/man/fitCyclopsSimulation.Rd @@ -9,7 +9,8 @@ fitCyclopsSimulation( useCyclops = TRUE, model = "logistic", coverage = TRUE, - includePenalty = FALSE + includePenalty = FALSE, + computeDevice = "native" ) } \arguments{ diff --git a/man/getFineGrayWeights.Rd b/man/getFineGrayWeights.Rd index 21f2237f..0e92dce3 100644 --- a/man/getFineGrayWeights.Rd +++ b/man/getFineGrayWeights.Rd @@ -4,7 +4,7 @@ \alias{getFineGrayWeights} \title{Creates a \code{Surv} object that forces in competing risks and the IPCW needed for Fine-Gray estimation.} \usage{ -getFineGrayWeights(ftime, fstatus, cencode = 0, failcode = 1) +getFineGrayWeights(ftime, 
fstatus, cvweights = NULL, cencode = 0, failcode = 1)
 }
 \arguments{
 \item{ftime}{Numeric: Observed event (failure) times}

From 5e252ab47f8c9e3d3c7c879c05305a4a23b9a67e Mon Sep 17 00:00:00 2001
From: JianxiaoYang
Date: Fri, 14 Apr 2023 18:24:22 +0000
Subject: [PATCH 13/18] update R wrapper for bootstrap

---
 R/ModelFit.R                 | 11 +++++++++--
 R/RcppExports.R              |  4 ++--
 man/runBootstrap.Rd          | 20 ++++++++++++++++++++
 src/RcppCyclopsInterface.cpp |  3 ++-
 src/RcppExports.cpp          |  9 +++++----
 5 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 man/runBootstrap.Rd

diff --git a/R/ModelFit.R b/R/ModelFit.R
index bb7269c7..2802835e 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -784,10 +784,17 @@ getSEs <- function(object, covariates) {
     ses
 }
 
+#' @title Run Bootstrap for Cyclops model parameter
+#'
+#' @param object A fitted Cyclops model object
+#' @param outFileName Character: Output file name
+#' @param treatmentId Character: variable to output
+#' @param replicates Numeric: number of bootstrap samples
+#'
 #' @export
-runBootstrap <- function(object, outFileName, treatmentId) {
+runBootstrap <- function(object, outFileName, treatmentId, replicates) {
     .checkInterface(object$cyclopsData, testOnly = TRUE)
-    bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId)
+    bs <- .cyclopsRunBootstrap(object$cyclopsData$cyclopsInterfacePtr, outFileName, treatmentId, replicates)
     bs
 }
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 3a5bac2a..6e67601e 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -93,8 +93,8 @@
     .Call(`_Cyclops_cyclopsFitModel`, inRcppCcdInterface)
 }
 
-.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId) {
-    .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId)
+.cyclopsRunBootstrap <- function(inRcppCcdInterface, outFileName, treatmentId, replicates) {
+    .Call(`_Cyclops_cyclopsRunBootstrap`, inRcppCcdInterface, outFileName, treatmentId, replicates)
 }
 
diff --git a/man/runBootstrap.Rd b/man/runBootstrap.Rd
new file mode 100644
index 00000000..ecea445f
--- /dev/null
+++ b/man/runBootstrap.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ModelFit.R
+\name{runBootstrap}
+\alias{runBootstrap}
+\title{Run Bootstrap for Cyclops model parameter}
+\usage{
+runBootstrap(object, outFileName, treatmentId, replicates)
+}
+\arguments{
+\item{object}{A fitted Cyclops model object}
+
+\item{outFileName}{Character: Output file name}
+
+\item{treatmentId}{Character: variable to output}
+
+\item{replicates}{Numeric: number of bootstrap samples}
+}
+\description{
+Run Bootstrap for Cyclops model parameter
+}
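A usage sketch for the finished wrapper; the simulated data, output file name, and covariate
label below are illustrative, and `treatmentId` is assumed to match one of the covariate labels
in the Cyclops data object:

    sim <- simulateCyclopsData(nstrata = 1, nrows = 1000, ncovars = 5, model = "logistic")
    cyclopsData <- convertToCyclopsData(sim$outcomes, sim$covariates,
                                        modelType = "lr", addIntercept = TRUE)
    fit <- fitCyclopsModel(cyclopsData)
    runBootstrap(fit, outFileName = "bootstrap.csv", treatmentId = "1", replicates = 100)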
diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp
index 3085ecbc..5f3458b0 100644
--- a/src/RcppCyclopsInterface.cpp
+++ b/src/RcppCyclopsInterface.cpp
@@ -515,12 +515,13 @@ List cyclopsFitModel(SEXP inRcppCcdInterface) {
 }
 
 // [[Rcpp::export(".cyclopsRunBootstrap")]]
-List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId) {
+List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId, int replicates) {
 	using namespace bsccs;
 
 	XPtr<RcppCcdInterface> interface(inRcppCcdInterface);
 	interface->getArguments().doBootstrap = true;
 	interface->getArguments().outFileName = outFileName;
+	interface->getArguments().replicates = replicates;
 
 	// Save parameter point-estimates
 	std::vector<double> savedBeta;
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 06d6d34f..548d50d8 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -306,15 +306,16 @@ BEGIN_RCPP
 END_RCPP
 }
 // cyclopsRunBootstrap
-List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId);
-RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP) {
+List cyclopsRunBootstrap(SEXP inRcppCcdInterface, const std::string& outFileName, std::string& treatmentId, int replicates);
+RcppExport SEXP _Cyclops_cyclopsRunBootstrap(SEXP inRcppCcdInterfaceSEXP, SEXP outFileNameSEXP, SEXP treatmentIdSEXP, SEXP replicatesSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< SEXP >::type inRcppCcdInterface(inRcppCcdInterfaceSEXP);
    Rcpp::traits::input_parameter< const std::string& >::type outFileName(outFileNameSEXP);
    Rcpp::traits::input_parameter< std::string& >::type treatmentId(treatmentIdSEXP);
-    rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId));
+    Rcpp::traits::input_parameter< int >::type replicates(replicatesSEXP);
+    rcpp_result_gen = Rcpp::wrap(cyclopsRunBootstrap(inRcppCcdInterface, outFileName, treatmentId, replicates));
    return rcpp_result_gen;
END_RCPP
}
@@ -818,7 +819,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_Cyclops_cyclopsSetControl", (DL_FUNC) &_Cyclops_cyclopsSetControl, 23},
     {"_Cyclops_cyclopsRunCrossValidationl", (DL_FUNC) &_Cyclops_cyclopsRunCrossValidationl, 1},
     {"_Cyclops_cyclopsFitModel", (DL_FUNC) &_Cyclops_cyclopsFitModel, 1},
-    {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 3},
+    {"_Cyclops_cyclopsRunBootstrap", (DL_FUNC) &_Cyclops_cyclopsRunBootstrap, 4},
     {"_Cyclops_cyclopsLogModel", (DL_FUNC) &_Cyclops_cyclopsLogModel, 1},
     {"_Cyclops_cyclopsInitializeModel", (DL_FUNC) &_Cyclops_cyclopsInitializeModel, 4},
     {"_Cyclops_listOpenCLDevices", (DL_FUNC) &_Cyclops_listOpenCLDevices, 0},

From c7710e1cda715470c1f8476aeda21221d9b529a2 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Fri, 14 Apr 2023 13:32:38 -0700
Subject: [PATCH 14/18] v3.3.1

---
 .Rbuildignore                           |  2 ++
 CRAN-SUBMISSION                         |  3 +++
 DESCRIPTION                             |  4 ++--
 NEWS.md                                 |  9 +++++++++
 cran-comments.md                        | 17 +++++++++++++----
 man/convertToCyclopsData.Rd             |  6 +++---
 src/RcppCyclopsInterface.cpp            |  6 +++---
 src/RcppModelData.cpp                   |  2 +-
 src/cyclops/CyclicCoordinateDescent.cpp |  4 ++++
 src/cyclops/engine/ModelSpecifics.hpp   |  4 ++--
 tools/configure.R                       |  7 +++++--
 11 files changed, 47 insertions(+), 17 deletions(-)
 create mode 100644 CRAN-SUBMISSION

diff --git a/.Rbuildignore b/.Rbuildignore
index 6d87dae1..094bb570 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -36,3 +36,5 @@ deprecated
 .covrignore
 src/Makevars$
 src/Makevars.win$
+to_fix*
+^CRAN-SUBMISSION$
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
new file mode 100644
index 00000000..e91558cb
--- /dev/null
+++ b/CRAN-SUBMISSION
@@ -0,0 +1,3 @@
+Version: 3.3.1
+Date: 2023-04-09 20:06:33 UTC
+SHA: d88c531efa06933c1007cb785343f1382c95ee0d
diff --git a/DESCRIPTION b/DESCRIPTION
index 3f493895..14d493be 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: Cyclops
 Type: Package
 Title: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis
-Version: 3.3.0
+Version: 3.3.1
 Authors@R: c(
     person("Marc A.", "Suchard", email = "msuchard@ucla.edu", role = c("aut","cre")),
     person("Martijn J.", "Schuemie", role = "aut"),
@@ -51,4 +51,4 @@ Suggests:
     ggplot2,
     microbenchmark,
     cmprsk
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
diff --git a/NEWS.md b/NEWS.md
index 3cf71839..4c72ca68 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,12 @@
+Cyclops v3.3.1
+==============
+
+Changes:
+
+1. fix uninitialized value detected in computeAsymptoticPrecisionMatrix(); value was priorType
+2. fix memory leak caused by call to ::Rf_error()
+3. fix line-endings on Makevars on windows
+
 Cyclops v3.3.0
 ==============
diff --git a/cran-comments.md b/cran-comments.md
index c4255b42..edfb58e1 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,16 +1,23 @@
-## Initial submission of patch update to package
+## Resubmission after VALGRIND feedback from Brian Ripley
 
-* fixed all WARNINGS, including generic-inconsistency, uninitialized values
-* fixed NOTE about C++11
+* fixed VALGRIND uninitialized value (UB) issue; variable 'priorType' was uninitialized,
+  specifically in:
+
+    ==2293460== Conditional jump or move depends on uninitialised value(s)
+    ==2293460==    at 0x22CE4848: bsccs::CyclicCoordinateDescent::computeAsymptoticPrecisionMatrix() (packages/tests-vg/Cyclops/src/cyclops/CyclicCoordinateDescent.cpp:1335)
+
+* fixed VALGRIND memory leak issue; was caused by calls to '::Rf_error()' instead of
+  'Rcpp::stop()' when handling some error edge-cases
 
 ## Test environments
 * local OS X install, R 4.1
+* r-devel-valgrind docker container
 * ubuntu 20.04 (via gh-actions: devel and release)
 * win-builder (devel and release)
 
 ## R CMD check results
 * There were no ERRORs or WARNINGs
-* There is 1 occasional NOTE:
+* There is 1 occasional NOTE (besides days since last submission):
     checking installed package size ... NOTE
     installed size is 22.5Mb
     sub-directories of 1Mb or more:
      libs  21.7Mb
@@ -23,4 +30,6 @@ availability of C++17 'if constexpr (...)' should decrease library size substant
 
 ## Downstream dependencies
 * 'EvidenceSynthesis' - checked and works.
+* 'EmpiricalCalibration' - checked and works.
 * 'IterativeHardThresholding' - checked and works.
+* 'BrokenAdaptiveRidge' - checked and works.
diff --git a/man/convertToCyclopsData.Rd b/man/convertToCyclopsData.Rd index 61a6bd75..391ec1ec 100644 --- a/man/convertToCyclopsData.Rd +++ b/man/convertToCyclopsData.Rd @@ -89,11 +89,11 @@ These columns are expected in the covariates object: } \section{Methods (by class)}{ \itemize{ -\item \code{data.frame}: Convert data from two \code{data.frame} +\item \code{convertToCyclopsData(data.frame)}: Convert data from two \code{data.frame} -\item \code{tbl_dbi}: Convert data from two \code{Andromeda} tables -}} +\item \code{convertToCyclopsData(tbl_dbi)}: Convert data from two \code{Andromeda} tables +}} \examples{ #Convert infert dataset to Cyclops format: covariates <- data.frame(stratumId = rep(infert$stratum, 2), diff --git a/src/RcppCyclopsInterface.cpp b/src/RcppCyclopsInterface.cpp index ae9b1914..ae8ff12e 100644 --- a/src/RcppCyclopsInterface.cpp +++ b/src/RcppCyclopsInterface.cpp @@ -622,8 +622,8 @@ void RcppCcdInterface::appendRList(Rcpp::List& list, const Rcpp::List& append) { } void RcppCcdInterface::handleError(const std::string& str) { -// Rcpp::stop(str); // TODO Want this to work - ::Rf_error(str.c_str()); + Rcpp::stop(str); + // ::Rf_error(str.c_str()); } bsccs::ConvergenceType RcppCcdInterface::parseConvergenceType(const std::string& convergenceName) { @@ -946,7 +946,7 @@ void RcppCcdInterface::initializeModelImpl( // } // singlePrior->setVariance(0, arguments.hyperprior); - JointPriorPtr prior; + JointPriorPtr prior = nullptr; // if (arguments.flatPrior.size() == 0) { // prior = bsccs::make_shared(singlePrior); // } else { diff --git a/src/RcppModelData.cpp b/src/RcppModelData.cpp index 7eb7796b..f585a6fe 100644 --- a/src/RcppModelData.cpp +++ b/src/RcppModelData.cpp @@ -675,7 +675,7 @@ List cyclopsReadFileData(const std::string& fileName, const std::string& modelTy //const std::vector& y = ptr->getYVectorRef(); double total = 0.0; //std::accumulate(y.begin(), y.end(), 0.0); - // delete reader; // TODO Test + delete reader; // TODO Use smart_ptr double time = timer(); List list = List::create( diff --git a/src/cyclops/CyclicCoordinateDescent.cpp b/src/cyclops/CyclicCoordinateDescent.cpp index fbb03df2..6e54b9f2 100644 --- a/src/cyclops/CyclicCoordinateDescent.cpp +++ b/src/cyclops/CyclicCoordinateDescent.cpp @@ -54,6 +54,8 @@ CyclicCoordinateDescent::CyclicCoordinateDescent( noiseLevel = NOISY; initialBound = 2.0; + priorType = priors::NONE; + init(hXI.getHasOffsetCovariate()); } @@ -89,6 +91,8 @@ CyclicCoordinateDescent::CyclicCoordinateDescent(const CyclicCoordinateDescent& noiseLevel = copy.noiseLevel; initialBound = copy.initialBound; + priorType = copy.priorType; + init(hXI.getHasOffsetCovariate()); if (copy.hWeights.size() > 0) { diff --git a/src/cyclops/engine/ModelSpecifics.hpp b/src/cyclops/engine/ModelSpecifics.hpp index 84f9eed3..e08e7da1 100644 --- a/src/cyclops/engine/ModelSpecifics.hpp +++ b/src/cyclops/engine/ModelSpecifics.hpp @@ -1124,7 +1124,7 @@ void ModelSpecifics::computeGradientAndHessianImpl(int index // gradient = result.real(); // hessian = result.imag(); - using RealTypePair = std::pair; + // using RealTypePair = std::pair; IteratorType it(hX, index); const int end = it.size() - 1; @@ -1266,7 +1266,7 @@ void ModelSpecifics::computeThirdDerivativeImpl(int index, d #endif RealType third = static_cast(0); - RealType hessian = static_cast(0); + // RealType hessian = static_cast(0); if (BaseModel::cumulativeGradientAndHessian) { // Compile-time switch diff --git a/tools/configure.R b/tools/configure.R index d9853786..08ab5033 100644 --- 
a/tools/configure.R
+++ b/tools/configure.R
@@ -19,5 +19,8 @@ if (getRversion() < "4.1") {
   }
 }
 
-cat(txt, file = makevars_out, sep = "\n")
-cat(txt_win, file = makevars_win_out, sep = "\n")
+if (.Platform$OS.type == "unix") {
+  cat(txt, file = makevars_out, sep = "\n")
+} else {
+  cat(txt_win, file = makevars_win_out, sep = "\n")
+}

From 478e5ac4a21b6b5616c5252ab4276f2b2f5a9fad Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Thu, 18 May 2023 05:18:52 -0700
Subject: [PATCH 15/18] add auto option to cvRepetitions

---
 NEWS.md                  |  2 ++
 R/ModelFit.R             | 18 ++++++++++++++++++
 tests/testthat/test-cv.R | 20 ++++++++++++++++++++
 tools/configure.R        |  2 +-
 4 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 33852b06..943d8096 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,8 @@ develop
 Changes:
 
 1. improvements on adaptive likelihood profiling
+2. add `auto` option to `cvRepetitions`
+3. bumped explicit C++11 requirement up to R v4.1
 
 Cyclops v3.3.1
 ==============
diff --git a/R/ModelFit.R b/R/ModelFit.R
index c4fbae74..385a011c 100644
--- a/R/ModelFit.R
+++ b/R/ModelFit.R
@@ -204,6 +204,11 @@ fitCyclopsModel <- function(cyclopsData,
             writeLines(paste("Using cross-validation selector type", control$selectorType))
         }
     }
+
+    if (control$cvRepetitions == "auto") {
+        control$cvRepetitions <- .getNumberOfRepetitions(getNumberOfRows(cyclopsData))
+    }
+
     control <- .setControl(cyclopsData$cyclopsInterfacePtr, control)
 
     threads <- control$threads
@@ -324,6 +329,11 @@ fitCyclopsModel <- function(cyclopsData,
     fit$scale <- cyclopsData$scale
     fit$threads <- threads
     fit$seed <- control$seed
+
+    if (prior$useCrossValidation) {
+        fit$cvRepetitions <- control$cvRepetitions
+    }
+
     class(fit) <- "cyclopsFit"
     return(fit)
 }
@@ -1116,3 +1126,11 @@ convertToGlmnetLambda <- function(variance, nobs) {
     graph <- lapply(0:(nParents - 1), function(n, types) { 0:(nTypes - 1) + n * nTypes }, types = nTypes)
     graph
 }
+
+.getNumberOfRepetitions <- function(nrows) {
+    top <- 1E2
+    factor <- 0.5
+    pmax(pmin(
+        ceiling(top / nrows^(factor))
+        , 10), 1)
+}
diff --git a/tests/testthat/test-cv.R b/tests/testthat/test-cv.R
index 8b47229c..3252e670 100644
--- a/tests/testthat/test-cv.R
+++ b/tests/testthat/test-cv.R
@@ -206,3 +206,23 @@ test_that("Seed gets returned", {
                            control = createControl(seed = NULL), warnings = FALSE)
     expect_true(!is.null(fit$seed))
 })
+
+test_that("Auto cvRepetitions", {
+    data <- simulateCyclopsData(nstrata = 1, nrows = 1000, ncovars = 10, model = "logistic")
+    cyclopsData <- convertToCyclopsData(data$outcomes,
+                                        data$covariates,
+                                        modelType = "lr",
+                                        addIntercept = TRUE)
+
+    prior <- createPrior("laplace", exclude = c(0), useCrossValidation = TRUE)
+
+    control <- createControl(noiseLevel = "quiet",
+                             cvType = "auto",
+                             cvRepetitions = "auto")
+
+    fit <- fitCyclopsModel(cyclopsData,
+                           prior = prior,
+                           control = control)
+
+    expect_equal(fit$cvRepetitions, 4)
+})
diff --git a/tools/configure.R b/tools/configure.R
index 08ab5033..acdda039 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -9,7 +9,7 @@ makevars_win_out <- file.path("src", "Makevars.win")
 txt <- readLines(makevars_in)
 txt_win <- readLines(makevars_win_in)
 
-if (getRversion() < "4.1") {
+if (getRversion() < "4.2") {
   if (!any(grepl("^CXX_STD", txt))) {
     txt <- c("CXX_STD = CXX11", txt)
  }
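For reference, the `auto` heuristic in `.getNumberOfRepetitions()` reduces to
ceiling(100 / sqrt(nrows)), clamped to the range [1, 10]; working it through explains the value
the new test expects:

    ceiling(100 / sqrt(1000))   # 4, matching expect_equal(fit$cvRepetitions, 4)
    ceiling(100 / sqrt(100))    # 10: small data sets get up to the maximum of 10 repetitions
    ceiling(100 / sqrt(1e6))    # 1: very large data sets get a single repetition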
From b20b830d4a2ec67ad834f62bc6cf8956179b20d0 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Fri, 19 May 2023 16:11:21 -0700
Subject: [PATCH 16/18] play with CXX11 flag some more

---
 tools/configure.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/configure.R b/tools/configure.R
index acdda039..4b4222e9 100644
--- a/tools/configure.R
+++ b/tools/configure.R
@@ -9,11 +9,13 @@ makevars_win_out <- file.path("src", "Makevars.win")
 txt <- readLines(makevars_in)
 txt_win <- readLines(makevars_win_in)
 
-if (getRversion() < "4.2") {
+if (getRversion() < "4.3") { # macOS / linux
   if (!any(grepl("^CXX_STD", txt))) {
     txt <- c("CXX_STD = CXX11", txt)
   }
+}
 
+if (getRversion() < "4.2") { # Windoz
   if (!any(grepl("^CXX_STD", txt_win))) {
     txt_win <- c("CXX_STD = CXX11", txt_win)
   }

From 60ebb58ce21bbeacb8a9148f21b0dfcab6a2839d Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Mon, 25 Sep 2023 06:52:05 -0700
Subject: [PATCH 17/18] new CRAN format

---
 R/Cyclops-package.R | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 R/Cyclops-package.R

diff --git a/R/Cyclops-package.R b/R/Cyclops-package.R
new file mode 100644
index 00000000..a65cf643
--- /dev/null
+++ b/R/Cyclops-package.R
@@ -0,0 +1,6 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+## usethis namespace: end
+NULL

From 95bfa2b7d8d9cf37dba148ef051d475a0ca71067 Mon Sep 17 00:00:00 2001
From: Marc Suchard
Date: Mon, 25 Sep 2023 09:37:39 -0700
Subject: [PATCH 18/18] counting process reformatting

---
 NAMESPACE                       |   3 +
 R/TimeEffects.R                 | 132 ++++++++++++++++++++++++++++++++
 man/Cyclops-package.Rd          |  41 ++++++++++
 man/convertToTimeVaryingCoef.Rd |  39 ++++++++++
 man/splitTime.Rd                |  26 +++++++
 5 files changed, 241 insertions(+)
 create mode 100644 R/TimeEffects.R
 create mode 100644 man/Cyclops-package.Rd
 create mode 100644 man/convertToTimeVaryingCoef.Rd
 create mode 100644 man/splitTime.Rd

diff --git a/NAMESPACE b/NAMESPACE
index d632ced3..4fc36ad5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ S3method(survfit,cyclopsFit)
 S3method(vcov,cyclopsFit)
 export(Multitype)
 export(convertToCyclopsData)
+export(convertToTimeVaryingCoef)
 export(coverage)
 export(createAutoGridCrossValidationControl)
 export(createControl)
@@ -40,6 +41,7 @@ export(meanLinearPredictor)
 export(mse)
 export(readCyclopsData)
 export(simulateCyclopsData)
+export(splitTime)
 import(Matrix)
 import(Rcpp)
 import(dplyr)
@@ -72,5 +74,6 @@ importFrom(stats,terms)
 importFrom(stats,time)
 importFrom(stats,vcov)
 importFrom(survival,Surv)
+importFrom(survival,survSplit)
 importFrom(survival,survfit)
 useDynLib(Cyclops, .registration = TRUE)
diff --git a/R/TimeEffects.R b/R/TimeEffects.R
new file mode 100644
index 00000000..5ae7aa0e
--- /dev/null
+++ b/R/TimeEffects.R
@@ -0,0 +1,132 @@
+#' @title Split the analysis time into several intervals for time-varying coefficients.
+#'
+#' @description
+#' \code{splitTime} splits the analysis time into several intervals for time-varying coefficients
+#'
+#' @param shortOut A data frame containing the outcomes with predefined columns (see below).
+#' @param cut Numeric: Time points to cut at
+#'
+#' These columns are expected in the shortOut object:
+#' \tabular{lll}{
+#'   \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{y}     \tab(real) \tab Observed event status \cr
+#'   \verb{time}  \tab(real) \tab Observed event time \cr
+#' }
+#'
+#' @return A long outcome table for time-varying coefficients.
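+#'
+#' @examples
+#' ## A minimal sketch on hypothetical data: cut follow-up time at days 30 and 90
+#' shortOut <- data.frame(rowId = 1:3, y = c(1, 0, 1), time = c(45, 120, 20))
+#' longOut <- splitTime(shortOut, cut = c(30, 90))
+#' ## Each subject contributes one row per interval reached; stratumId indexes the
+#' ## interval, and y is non-zero only in the interval containing the event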
+#' @importFrom survival survfit survSplit
+#' @export
+splitTime <- function(shortOut, cut) {
+
+    if (!"time" %in% colnames(shortOut)) stop("Must provide observed event time.")
+    if (!"y" %in% colnames(shortOut)) stop("Must provide observed event status.")
+    if ("rowId" %in% colnames(shortOut)) {
+        shortOut <- shortOut %>%
+            rename(subjectId = rowId) %>%
+            arrange(subjectId)
+    } else {
+        shortOut <- shortOut %>%
+            mutate(subjectId = row_number())
+    }
+
+    shortOut <- collect(shortOut)
+    longOut <- do.call('survSplit', list(formula = Surv(shortOut$time, shortOut$y)~.,
+                                         data = shortOut,
+                                         cut = cut,
+                                         episode = "stratumId",
+                                         id = "newSubjectId"))
+    longOut <- longOut %>%
+        rename(y = event) %>%
+        mutate(time = tstop - tstart) %>%
+        select(-c(newSubjectId, tstart, tstop)) %>%
+        arrange(stratumId, subjectId)
+
+    # Restore rowIds
+    SubjectIds <- shortOut$subjectId
+    newSubjectId <- max(SubjectIds)+1
+    longOut$rowId <- c(SubjectIds, # rowId = subjectId at 1st stratum
+                       newSubjectId:(newSubjectId+(nrow(longOut)-length(SubjectIds))-1)) # create new distinct rowIds for other strata
+
+    # Reorder columns
+    longOut <- longOut %>%
+        select(rowId, everything()) %>%
+        select(subjectId, everything()) %>%
+        select(stratumId, everything())
+
+    return(longOut)
+}
+
+#' @title Convert short sparse covariate table to long sparse covariate table for time-varying coefficients.
+#'
+#' @description
+#' \code{convertToTimeVaryingCoef} converts a short sparse covariate table to a long sparse covariate table for time-varying coefficients.
+#'
+#' @param shortCov A data frame containing the covariate with predefined columns (see below).
+#' @param longOut A data frame containing the outcomes with predefined columns (see below), output of \code{splitTime}.
+#' @param timeVaryCoefId Integer: A numeric identifier of a time-varying coefficient
+#'
+#' @details
+#' These columns are expected in the shortCov object:
+#' \tabular{lll}{
+#'   \verb{rowId}          \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{covariateId}    \tab(integer) \tab A numeric identifier of a covariate \cr
+#'   \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr
+#' }
+#'
+#' These columns are expected in the longOut object:
+#' \tabular{lll}{
+#'   \verb{stratumId} \tab(integer) \tab Stratum ID for time-varying models \cr
+#'   \verb{subjectId} \tab(integer) \tab Subject ID is used to link multiple covariates (x) at different time intervals to a single subject \cr
+#'   \verb{rowId}     \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+#'   \verb{y}         \tab(real) \tab The outcome variable \cr
+#'   \verb{time}      \tab(real) \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr
+#'                    \tab \tab(e.g. number of days) \cr
+#' }
+#' @return A long sparse covariate table for time-varying coefficients.
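+#'
+#' @examples
+#' ## A minimal sketch on hypothetical data: give covariate 2 a time-varying effect
+#' shortCov <- data.frame(rowId = c(1, 1, 2),
+#'                        covariateId = c(1, 2, 2),
+#'                        covariateValue = c(1, 1, 1))
+#' shortOut <- data.frame(rowId = 1:2, y = c(1, 0), time = c(45, 120))
+#' longOut <- splitTime(shortOut, cut = c(30, 90))
+#' longCov <- convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId = 2)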
+#' @export +convertToTimeVaryingCoef <- function(shortCov, longOut, timeVaryCoefId) { + + # Process time-varying coefficients + timeVaryCoefId <- sort(unique(timeVaryCoefId)) + numTime <- length(timeVaryCoefId) # number of time-varying covariates + maxCovId <- max(shortCov$covariateId) + + # First stratum + longCov <- shortCov + longCov$stratumId <- 1 + colnames(longCov)[which(names(longCov) == "rowId")] <- "subjectId" + colnames(shortCov)[which(names(shortCov) == "rowId")] <- "subjectId" + + # Rest of strata + maxStrata <- max(longOut$stratumId) + for (st in 2:maxStrata) { + + # get valid subjects in current stratum + subId <- longOut[longOut$stratumId == st, ]$subjectId + + # get valid sparse covariates information in current stratum + curStrata <- shortCov[shortCov$subjectId %in% subId, ] + + if (any(curStrata$covariateId %in% timeVaryCoefId)) { # skip when valid subjects only have non-zero time-indep covariates + curStrata$stratumId <- st # assign current stratumId + + # recode covariateId for time-varying coefficients + # TODO update label + for (i in 1:numTime) { + curStrata[curStrata$covariateId == timeVaryCoefId[i], "covariateId"] <- maxCovId + numTime * (st - 2) + i + } + + # bind current stratum to longCov + longCov <- rbind(longCov, curStrata) + } + } + + # match rowId in longCov + longCov$rowId <- NA + for (i in 1:nrow(longCov)) { + longCov$rowId[i] <- longOut[with(longOut, subjectId == longCov$subjectId[i] & stratumId == longCov$stratumId[i]), "rowId"] + } + + return(longCov) +} + diff --git a/man/Cyclops-package.Rd b/man/Cyclops-package.Rd new file mode 100644 index 00000000..d726da88 --- /dev/null +++ b/man/Cyclops-package.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Cyclops-package.R +\docType{package} +\name{Cyclops-package} +\alias{Cyclops} +\alias{Cyclops-package} +\title{Cyclops: Cyclic Coordinate Descent for Logistic, Poisson and Survival Analysis} +\description{ +This model fitting tool incorporates cyclic coordinate descent and majorization-minimization approaches to fit a variety of regression models found in large-scale observational healthcare data. Implementations focus on computational optimization and fine-scale parallelization to yield efficient inference in massive datasets. Please see: Suchard, Simpson, Zorych, Ryan and Madigan (2013) \doi{10.1145/2414416.2414791}. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/ohdsi/cyclops} + \item Report bugs at \url{https://github.com/ohdsi/cyclops/issues} +} + +} +\author{ +\strong{Maintainer}: Marc A. Suchard \email{msuchard@ucla.edu} + +Authors: +\itemize{ + \item Martijn J. Schuemie + \item Trevor R. 
Shaddox
+  \item Yuxi Tian
+  \item Jianxiao Yang
+  \item Eric Kawaguchi
+}
+
+Other contributors:
+\itemize{
+  \item Sushil Mittal [contributor]
+  \item Observational Health Data Sciences and Informatics [copyright holder]
+  \item Marcus Geelnard (provided the TinyThread library) [copyright holder, contributor]
+  \item Rutgers University (provided the HParSearch routine) [copyright holder, contributor]
+  \item R Development Core Team (provided the ZeroIn routine) [copyright holder, contributor]
+}
+
+}
+\keyword{internal}
diff --git a/man/convertToTimeVaryingCoef.Rd b/man/convertToTimeVaryingCoef.Rd
new file mode 100644
index 00000000..b587f8f2
--- /dev/null
+++ b/man/convertToTimeVaryingCoef.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/TimeEffects.R
+\name{convertToTimeVaryingCoef}
+\alias{convertToTimeVaryingCoef}
+\title{Convert short sparse covariate table to long sparse covariate table for time-varying coefficients.}
+\usage{
+convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId)
+}
+\arguments{
+\item{shortCov}{A data frame containing the covariate with predefined columns (see below).}
+
+\item{longOut}{A data frame containing the outcomes with predefined columns (see below), output of \code{splitTime}.}
+
+\item{timeVaryCoefId}{Integer: A numeric identifier of a time-varying coefficient}
+}
+\value{
+A long sparse covariate table for time-varying coefficients.
+}
+\description{
+\code{convertToTimeVaryingCoef} converts a short sparse covariate table to a long sparse covariate table for time-varying coefficients.
+}
+\details{
+These columns are expected in the shortCov object:
+\tabular{lll}{
+  \verb{rowId}          \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{covariateId}    \tab(integer) \tab A numeric identifier of a covariate \cr
+  \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr
+}
+
+These columns are expected in the longOut object:
+\tabular{lll}{
+  \verb{stratumId} \tab(integer) \tab Stratum ID for time-varying models \cr
+  \verb{subjectId} \tab(integer) \tab Subject ID is used to link multiple covariates (x) at different time intervals to a single subject \cr
+  \verb{rowId}     \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{y}         \tab(real) \tab The outcome variable \cr
+  \verb{time}      \tab(real) \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr
+                   \tab \tab(e.g. number of days) \cr
+}
+}
diff --git a/man/splitTime.Rd b/man/splitTime.Rd
new file mode 100644
index 00000000..fedc4a7e
--- /dev/null
+++ b/man/splitTime.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/TimeEffects.R
+\name{splitTime}
+\alias{splitTime}
+\title{Split the analysis time into several intervals for time-varying coefficients.}
+\usage{
+splitTime(shortOut, cut)
+}
+\arguments{
+\item{shortOut}{A data frame containing the outcomes with predefined columns (see below).}
+
+\item{cut}{Numeric: Time points to cut at
+
+These columns are expected in the shortOut object:
+\tabular{lll}{
+  \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome (y) \cr
+  \verb{y}     \tab(real) \tab Observed event status \cr
+  \verb{time}  \tab(real) \tab Observed event time \cr
+}}
+}
+\value{
+A long outcome table for time-varying coefficients.
+}
+\description{
+\code{splitTime} splits the analysis time into several intervals for time-varying coefficients
+}
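Taken together, the two new helpers feed the existing model-fitting pipeline; a hedged
end-to-end sketch, continuing the hypothetical examples above and assuming the long tables
satisfy the column requirements of convertToCyclopsData for a Cox model:

    longOut <- splitTime(shortOut, cut = c(30, 90))
    longCov <- convertToTimeVaryingCoef(shortCov, longOut, timeVaryCoefId = 2)
    cyclopsData <- convertToCyclopsData(longOut, longCov, modelType = "cox")
    fit <- fitCyclopsModel(cyclopsData)
    coef(fit)   # the recoded covariate ids carry one coefficient per time interval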