-
Notifications
You must be signed in to change notification settings - Fork 0
/
counting_sort.cu
127 lines (100 loc) · 4.08 KB
/
counting_sort.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <cuda.h>
#define THREAD_NUM 1024
// Tuned for NVIDIA Tesla V100 (12 GB VRAM)
#define ARR_SPLIT_LEN 805306368 // 3 GB (= 805,306,368 * 4 Byte)
// Util
__host__ int array_split(size_t arr_len[], int size) {
int arr_len_cnt = 0;
if (size > 2 * ARR_SPLIT_LEN) {
arr_len_cnt = 3;
arr_len[0] = arr_len[1] = ARR_SPLIT_LEN;
arr_len[2] = size - 2 * ARR_SPLIT_LEN;
} else if (size > ARR_SPLIT_LEN) {
arr_len_cnt = 2;
arr_len[0] = ARR_SPLIT_LEN;
arr_len[1] = size - ARR_SPLIT_LEN;
} else {
arr_len_cnt = 1;
arr_len[0] = size;
}
return arr_len_cnt;
}
// Histogram
__global__ void build_histogram_kernel(int arr[], int histogram[], int arr_size) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < arr_size) {
atomicAdd(&histogram[arr[idx]], 1);
}
}
__host__ void build_histogram(int arr[], int histogram[], int size, int max_val) {
int *arr_device;
size_t arr_len[3] = { 0, 0, 0 };
int arr_len_cnt = array_split(arr_len, size);
int *histogram_device;
size_t histogram_len = max_val;
cudaMalloc(&arr_device, arr_len[0] * sizeof(int));
cudaMalloc(&histogram_device, histogram_len * sizeof(int));
cudaMemset(histogram_device, 0, histogram_len * sizeof(int));
for (int i = 0; i < arr_len_cnt; i++) {
int block_num = (arr_len[i] / THREAD_NUM) + (arr_len[i] % THREAD_NUM == 0 ? 0 : 1);
cudaMemcpy(arr_device, &arr[i * ARR_SPLIT_LEN], arr_len[i] * sizeof(int), cudaMemcpyHostToDevice);
build_histogram_kernel <<< block_num, THREAD_NUM >>> (arr_device, histogram_device, arr_len[i]);
}
cudaMemcpy(histogram, histogram_device, histogram_len * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(arr_device);
cudaFree(histogram_device);
}
// Prefix
__host__ void build_prefix(int histogram[], int max_val) {
int sum = 0;
for (int i = 0; i < max_val; i++) {
sum += histogram[i];
histogram[i] = sum;
}
}
// Output
__global__ void build_output_kernel(int arr[], int prefix[], int prefix_size, int first_prefix, int base_idx) {
__shared__ int local_prefix[THREAD_NUM + 1];
unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int local_idx = threadIdx.x + 1;
if (global_idx == 0) {
local_prefix[0] = first_prefix;
} else if (local_idx == 1) {
local_prefix[0] = prefix[global_idx - 1];
}
if (global_idx < prefix_size) {
local_prefix[local_idx] = prefix[global_idx];
__syncthreads();
int cnt = local_prefix[local_idx] - local_prefix[local_idx - 1];
int start_idx = local_prefix[local_idx - 1];
for (int i = 0; i < cnt; i++) {
arr[start_idx + i] = global_idx + base_idx;
}
}
}
__host__ void build_output(int arr_out[], int prefix[], int size, int max_val) {
int *prefix_device;
size_t prefix_len[3] = { 0, 0, 0 };
int prefix_len_cnt = array_split(prefix_len, max_val);
int *arr_out_device;
int arr_out_len = size;
cudaMalloc(&arr_out_device, arr_out_len * sizeof(int));
cudaMalloc(&prefix_device, prefix_len[0] * sizeof(int));
for (int i = 0; i < prefix_len_cnt; i++) {
int block_num = (prefix_len[i] / THREAD_NUM) + (prefix_len[i] % THREAD_NUM == 0 ? 0 : 1);
int first_prefix = (i == 0 ? 0 : prefix[i * ARR_SPLIT_LEN - 1]);
cudaMemcpy(prefix_device, &prefix[i * ARR_SPLIT_LEN], prefix_len[i] * sizeof(int), cudaMemcpyHostToDevice);
build_output_kernel <<< block_num, THREAD_NUM >>> (arr_out_device, prefix_device, prefix_len[i], first_prefix, i * ARR_SPLIT_LEN);
}
cudaMemcpy(arr_out, arr_out_device, arr_out_len * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(arr_out_device);
cudaFree(prefix_device);
}
// Sort
__host__ void counting_sort(int arr[], int size, int max_val) {
int *histogram_and_prefix = new int[max_val];
build_histogram(arr, histogram_and_prefix, size, max_val);
build_prefix(histogram_and_prefix, max_val);
build_output(arr, histogram_and_prefix, size, max_val);
delete [] histogram_and_prefix;
}