diff --git a/examples/misc/c_groups.cu b/examples/misc/c_groups.cu new file mode 100644 index 0000000..744a442 --- /dev/null +++ b/examples/misc/c_groups.cu @@ -0,0 +1,65 @@ + +#include +#include +#include "src/include/utils.h" +#include + +using namespace cooperative_groups; +__device__ int reduce_sum(thread_group g, int *temp, int val) +{ + int lane = g.thread_rank(); + + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + temp[lane] = val; + g.sync(); // wait for all threads to store + if(lane>>(sum, data, n); + cudaDeviceSynchronize(); + fmt::print("The array is sized {}\n", n); + fmt::print("Sum is equal to {}\n", *sum); + + return 0; +}