dwtHaar1D_kernel.cuh
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* 1D DWT for Haar wavelet and signals with a length which is a power of 2.
* The code reduces bank conflicts and non-coalesced reads / writes as
* appropriate but does not fully remove them because the computational
* overhead to achieve this would outweigh the benefit (see inline comments
* for more details).
* Large signals are subdivided into sub-signals with 512 elements and the
* wavelet transform for these is computed with one block over 10 decomposition
* levels. The resulting signal consisting of the approximation coefficients at
* level X is then processed in a subsequent step on the device. This requires
* interblock synchronization which is only possible on host side.
* Detail coefficients which have been computed are not further referenced
* during the decomposition so that they can be stored directly in their final
* position in global memory. The transform and its storing scheme preserve
* locality in the coefficients so that these writes are coalesced.
* Approximation coefficients are stored in shared memory because they are
* needed to compute the subsequent decomposition step. The top most
* approximation coefficient for a sub-signal processed by one block is stored
* in a special global memory location to simplify the processing after the
* interblock synchronization.
* Most books on wavelets explain the Haar wavelet decomposition. A good freely
* available resource is the Wavelet primer by Stollnitz et al.
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
* The basis of all wavelet transforms is to decompose a signal into
* approximation (a) and detail (d) coefficients, where the detail tends to be
* small or zero, which allows / simplifies compression. The following "graphs"
* demonstrate the transform for a signal of length eight. The index always
* describes the decomposition level at which
* a coefficient arises. The input signal is interpreted as approximation signal
* at level 0. The coefficients computed on the device are stored in the same
* scheme as in the example. This data structure is particularly well suited for
* compression and also preserves the hierarchical structure of the
* decomposition.
*
* -------------------------------------------------
* | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
* -------------------------------------------------
*
* -------------------------------------------------
* | a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
* -------------------------------------------------
*
* -------------------------------------------------
* | a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
* -------------------------------------------------
*
* -------------------------------------------------
* | a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
* -------------------------------------------------
*
* Device Code.
*/
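/*
 * A small worked example of a single decomposition step (numbers are
 * illustrative, not taken from the sample): for the input pair (8, 4) the
 * Haar step computes
 *     a = (8 + 4) * (1 / sqrt(2)) ~= 8.49
 *     d = (8 - 4) * (1 / sqrt(2)) ~= 2.83
 * so level 1 of the signal [8, 4, 6, 2] is [8.49, 5.66 | 2.83, 2.83], with
 * the approximation half on the left and the detail half on the right,
 * matching the storage scheme sketched above.
 */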
#ifndef _DWTHAAR1D_KERNEL_H_
#define _DWTHAAR1D_KERNEL_H_
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// INV_SQRT_2 and LOG_NUM_BANKS are referenced below but not defined in this
// header; they are expected to come from the including translation unit.
// Fallback definitions keep the header self-contained.
#ifndef INV_SQRT_2
#define INV_SQRT_2 0.70710678118654752440f  // 1 / sqrt(2), Haar normalization
#endif
#ifndef LOG_NUM_BANKS
#define LOG_NUM_BANKS 4  // log2 of the number of shared memory banks (16)
#endif
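// A minimal host-side reference sketch (not part of the original sample,
// added for illustration) that performs the same orthonormal Haar transform
// as the kernels below; useful for validating device results. The signal
// length is assumed to be a power of 2; the result uses the
// (approx | detail_0 | detail_1 | ...) layout described above.
inline void dwtHaar1DHostRef(float *signal, unsigned int slength) {
    float *tmp = new float[slength];

    // each pass halves the region holding approximation coefficients
    for (unsigned int len = slength; len > 1; len >>= 1) {
        unsigned int half = len >> 1;

        for (unsigned int i = 0; i < half; ++i) {
            // approximation (scaled average) and detail (scaled difference)
            tmp[i] = (signal[2 * i] + signal[2 * i + 1]) * INV_SQRT_2;
            tmp[half + i] = (signal[2 * i] - signal[2 * i + 1]) * INV_SQRT_2;
        }

        for (unsigned int i = 0; i < len; ++i) {
            signal[i] = tmp[i];
        }
    }

    delete[] tmp;
}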
////////////////////////////////////////////////////////////////////////////////
//! Initialize device memory with a given value
//! @param od output data
//! @param value value each element of od is set to
////////////////////////////////////////////////////////////////////////////////
__global__ void initValue(float *od, float value) {
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();

    // position of write into global memory
    unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    od[index] = value;

    // sync the block before it retires (not strictly required for this
    // independent per-thread write)
    cg::sync(cta);
}
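// A hypothetical host-side helper (not part of the sample) showing how
// initValue might be launched with one thread per element; the names and the
// default block size are assumptions for illustration.
inline void launchInitValue(float *d_odata, unsigned int num_elements,
                            float value, unsigned int block_size = 256) {
    // one thread per element; num_elements is assumed to be a multiple of
    // block_size, matching the power-of-2 signals this sample targets
    initValue<<<num_elements / block_size, block_size>>>(d_odata, value);
}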
////////////////////////////////////////////////////////////////////////////////
//! Compute partial wavelet decomposition on the GPU using Haar basis
//! For each thread block the full decomposition is computed but these results
//! have to be combined
//! Use one thread to perform the full decomposition
//! @param id input data
//! @param od output data
//! @param approx_final place to store the final approximation coefficient for
//! the subsignal
//! @param dlevels number of decomposition levels for this transform
//! @param slength_step_half half signal length for current decomposition
//! level (offset for storing detail coefficients in
//! global memory)
//! @param bdim block dimension
////////////////////////////////////////////////////////////////////////////////
__global__ void dwtHaar1D(float *id, float *od, float *approx_final,
                          const unsigned int dlevels,
                          const unsigned int slength_step_half,
                          const int bdim) {
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();

    // shared memory for part of the signal
    extern __shared__ float shared[];

    // thread runtime environment, 1D parametrization
    const int gdim = gridDim.x;
    // const int bdim = blockDim.x;
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    // global thread id (w.r.t. the total data set)
    const int tid_global = (bid * bdim) + tid;
    unsigned int idata = (bid * (2 * bdim)) + tid;

    // read data from global memory
    shared[tid] = id[idata];
    shared[tid + bdim] = id[idata + bdim];
    cg::sync(cta);

    // this operation has a two-way bank conflict for all threads; these are
    // two additional cycles for each warp -- all alternatives to avoid this
    // bank conflict are more expensive than the cycles introduced by
    // serialization
    float data0 = shared[2 * tid];
    float data1 = shared[(2 * tid) + 1];
    cg::sync(cta);

    // detail coefficient, not further referenced so directly store in
    // global memory
    od[tid_global + slength_step_half] = (data0 - data1) * INV_SQRT_2;

    // offset to avoid bank conflicts
    // see the scan example for a more detailed description
    unsigned int atid = tid + (tid >> LOG_NUM_BANKS);
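    // worked example of the padding above, assuming LOG_NUM_BANKS == 4
    // (16 shared memory banks): tid 0..15 map to atid 0..15, tid 16 maps to
    // 17, tid 32 maps to 34, ... -- one pad slot is skipped after every 16
    // entries so the threads of a half-warp hit distinct banks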
    // approximation coefficient
    // store in shared memory for further decomposition steps in this global
    // step
    shared[atid] = (data0 + data1) * INV_SQRT_2;

    // all threads have to write the approximation coefficient to shared
    // memory before the next steps can take place
    cg::sync(cta);

    // early out if possible
    // the compiler removes this part from the source because dlevels is
    // a constant kernel input
    // note: syncthreads in bodies of branches can lead to deadlocks unless
    // the condition evaluates the same way for ALL threads of a block, as in
    // this case
    if (dlevels > 1) {
        // offset to the second element in shared memory which has to be used
        // for the decomposition, effectively 2^(i - 1)
        unsigned int offset_neighbor = 1;

        // number of active threads per decomposition level
        // identical to the offset for the detail coefficients
        unsigned int num_threads = bdim >> 1;

        // index for the first element of the pair to process
        // the representation is still compact (and therefore still tid * 2)
        // because the first step operated on registers and only the result
        // has been written to shared memory
        unsigned int idata0 = tid * 2;

        // offset levels to make the loop more efficient
        for (unsigned int i = 1; i < dlevels; ++i) {
            // Non-coalesced writes occur if the number of active threads
            // becomes less than 16 for a block because the start address for
            // the first block is not always aligned with 64 bytes, which is
            // necessary for coalesced access. However, the problem only
            // occurs at high levels with only a small number of active
            // threads so that the total number of non-coalesced accesses is
            // rather small and does not justify the computations which would
            // be necessary to avoid these uncoalesced writes (this has been
            // tested and verified)
            if (tid < num_threads) {
                // update stride; with each decomposition level the stride
                // grows by a factor of 2
                unsigned int idata1 = idata0 + offset_neighbor;

                // position of write into global memory
                unsigned int g_wpos =
                    (num_threads * gdim) + (bid * num_threads) + tid;

                // compute wavelet decomposition step

                // offset to avoid bank conflicts
                unsigned int c_idata0 = idata0 + (idata0 >> LOG_NUM_BANKS);
                unsigned int c_idata1 = idata1 + (idata1 >> LOG_NUM_BANKS);

                // detail coefficient, not further modified so directly store
                // in global memory
                od[g_wpos] = (shared[c_idata0] - shared[c_idata1]) * INV_SQRT_2;

                // approximation coefficient
                // note that the representation in shared memory becomes
                // rather sparse (with a lot of holes in between) but the
                // storing scheme in global memory guarantees that the common
                // representation (approx, detail_0, detail_1, ...) is
                // achieved
                shared[c_idata0] =
                    (shared[c_idata0] + shared[c_idata1]) * INV_SQRT_2;

                // update storage offset for details
                num_threads = num_threads >> 1;  // div 2
                offset_neighbor <<= 1;           // mul 2
                idata0 = idata0 << 1;            // mul 2
            }

            // sync after each decomposition step
            cg::sync(cta);
        }

        // write the top most level element for the next decomposition steps
        // which are performed after an interblock synchronization on host
        // side
        if (0 == tid) {
            approx_final[bid] = shared[0];
        }
    }  // end early out if possible
}
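// A minimal single-block usage sketch (an assumption for illustration, not
// the sample's full multi-block driver): for a signal short enough that one
// block can process it, the whole decomposition needs a single launch and no
// host-side interblock synchronization. Pointer names are hypothetical;
// d_id / d_od / d_approx are device buffers of slength, slength and 1 floats.
inline void dwtHaar1DSingleBlock(float *d_id, float *d_od, float *d_approx,
                                 unsigned int slength) {
    // one thread per coefficient pair
    unsigned int threads = slength / 2;

    // number of decomposition levels: log2(slength)
    unsigned int dlevels = 0;
    for (unsigned int n = slength; n > 1; n >>= 1) {
        ++dlevels;
    }

    // shared memory holds the sub-signal plus the padding introduced by the
    // bank-conflict offsets
    size_t smem = (slength + (slength >> LOG_NUM_BANKS)) * sizeof(float);

    // detail coefficients of the first step go to od[slength / 2 ..), hence
    // slength_step_half = slength / 2
    dwtHaar1D<<<1, threads, smem>>>(d_id, d_od, d_approx, dlevels,
                                    slength / 2, threads);

    // afterwards d_od holds all detail coefficients and d_approx[0] the
    // top-level approximation coefficient
}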
#endif // #ifndef _DWTHAAR1D_KERNEL_H_