forked from NVIDIA/cuda-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dwtHaar1D.cu
389 lines (330 loc) · 14.2 KB
/
dwtHaar1D.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* 1D DWT for Haar wavelet and signals with a length which is a power of 2.
* The code reduces bank conflicts and non-coalesced reads / writes as
* appropriate but does not fully remove them because the computational
* overhead to achieve this would outweighs the benefit (see inline comments
* for more details).
* Large signals are subdivided into sub-signals with 512 elements and the
* wavelet transform for these is computed with one block over 10 decomposition
* levels. The resulting signal consisting of the approximation coefficients at
* level X is then processed in a subsequent step on the device. This requires
* interblock synchronization which is only possible on host side.
* Detail coefficients which have been computed are not further referenced
* during the decomposition so that they can be stored directly in their final
* position in global memory. The transform and its storing scheme preserve
* locality in the coefficients so that these writes are coalesced.
* Approximation coefficients are stored in shared memory because they are
* needed to compute the subsequent decomposition step. The top most
* approximation coefficient for a sub-signal processed by one block is stored
* in a special global memory location to simplify the processing after the
* interblock synchronization.
* Most books on wavelets explain the Haar wavelet decomposition. A good freely
* available resource is the Wavelet primer by Stollnitz et al.
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
* The basic of all Wavelet transforms is to decompose a signal into
* approximation (a) and detail (d) coefficients where the detail tends to be
* small or zero which allows / simplifies compression. The following "graphs"
* demonstrate the transform for a signal
* of length eight. The index always describes the decomposition level where
* a coefficient arises. The input signal is interpreted as approximation signal
* at level 0. The coefficients computed on the device are stored in the same
* scheme as in the example. This data structure is particularly well suited for
* compression and also preserves the hierarchical structure of the
decomposition.
-------------------------------------------------
| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
-------------------------------------------------
-------------------------------------------------
| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------
-------------------------------------------------
| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------
-------------------------------------------------
| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------
* Host code.
*/
#ifdef _WIN32
#define NOMINMAX
#endif
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>
// constants which are used in host and device code
#define INV_SQRT_2 0.70710678118654752440f;
const unsigned int LOG_NUM_BANKS = 4;
const unsigned int NUM_BANKS = 16;
////////////////////////////////////////////////////////////////////////////////
// includes, kernels
#include "dwtHaar1D_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
bool getLevels(unsigned int len, unsigned int *levels);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// run test
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Perform the wavelet decomposition
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
bool bResult = false; // flag for final validation of the results
char *s_fname = NULL, *r_gold_fname = NULL;
char r_fname[256];
const char usage[] = {
"\nUsage:\n"
" dwtHaar1D --signal=<signal_file> --result=<result_file> "
"--gold=<gold_file>\n\n"
" <signal_file> Input file containing the signal\n"
" <result_file> Output file storing the result of the wavelet "
"decomposition\n"
" <gold_file> Input file containing the reference result of the "
"wavelet decomposition\n"
"\nExample:\n"
" ./dwtHaar1D\n"
" --signal=signal.dat\n"
" --result=result.dat\n"
" --gold=regression.gold.dat\n"};
printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
findCudaDevice(argc, (const char **)argv);
// file names, either specified as cmd line args or use default
if (argc == 4) {
char *tmp_sfname, *tmp_rfname, *tmp_goldfname;
if ((getCmdLineArgumentString(argc, (const char **)argv, "signal",
&tmp_sfname) != true) ||
(getCmdLineArgumentString(argc, (const char **)argv, "result",
&tmp_rfname) != true) ||
(getCmdLineArgumentString(argc, (const char **)argv, "gold",
&tmp_goldfname) != true)) {
fprintf(stderr, "Invalid input syntax.\n%s", usage);
exit(EXIT_FAILURE);
}
s_fname = sdkFindFilePath(tmp_sfname, argv[0]);
r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]);
strcpy(r_fname, tmp_rfname);
} else {
s_fname = sdkFindFilePath("signal.dat", argv[0]);
r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]);
strcpy(r_fname, "result.dat");
}
printf("source file = \"%s\"\n", s_fname);
printf("reference file = \"%s\"\n", r_fname);
printf("gold file = \"%s\"\n", r_gold_fname);
// read in signal
unsigned int slength = 0;
float *signal = NULL;
if (s_fname == NULL) {
fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);
exit(EXIT_FAILURE);
}
if (sdkReadFile(s_fname, &signal, &slength, false) == true) {
printf("Reading signal from \"%s\"\n", s_fname);
} else {
exit(EXIT_FAILURE);
}
// get the number of decompositions necessary to perform a full decomposition
unsigned int dlevels_complete = 0;
if (true != getLevels(slength, &dlevels_complete)) {
// error message
fprintf(stderr, "Signal length not supported.\n");
// cleanup and abort
free(signal);
exit(EXIT_FAILURE);
}
// device in data
float *d_idata = NULL;
// device out data
float *d_odata = NULL;
// device approx_final data
float *approx_final = NULL;
// The very final approximation coefficient has to be written to the output
// data, all others are reused as input data in the next global step and
// therefore have to be written to the input data again.
// The following flag indicates where to copy approx_final data
// - 0 is input, 1 is output
int approx_is_input;
// allocate device mem
const unsigned int smem_size = sizeof(float) * slength;
checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));
checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));
checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));
// copy input data to device
checkCudaErrors(
cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));
// total number of threads
// in the first decomposition step always one thread computes the average and
// detail signal for one pair of adjacent values
unsigned int num_threads_total_left = slength / 2;
// decomposition levels performed in the current / next step
unsigned int dlevels_step = dlevels_complete;
// 1D signal so the arrangement of elements is also 1D
dim3 block_size;
dim3 grid_size;
// number of decomposition levels left after one iteration on the device
unsigned int dlevels_left = dlevels_complete;
// if less or equal 1k elements, then the data can be processed in one block,
// this avoids the Wait-For-Idle (WFI) on host side which is necessary if the
// computation is split across multiple SM's if enough input data
if (dlevels_complete <= 10) {
// decomposition can be performed at once
block_size.x = num_threads_total_left;
approx_is_input = 0;
} else {
// 512 threads per block
grid_size.x = (num_threads_total_left / 512);
block_size.x = 512;
// 512 threads corresponds to 10 decomposition steps
dlevels_step = 10;
dlevels_left -= 10;
approx_is_input = 1;
}
// Initialize d_odata to 0.0f
initValue<<<grid_size, block_size>>>(d_odata, 0.0f);
// do until full decomposition is accomplished
while (0 != num_threads_total_left) {
// double the number of threads as bytes
unsigned int mem_shared = (2 * block_size.x) * sizeof(float);
// extra memory requirements to avoid bank conflicts
mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);
// run kernel
dwtHaar1D<<<grid_size, block_size, mem_shared>>>(
d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left,
block_size.x);
// Copy approx_final to appropriate location
if (approx_is_input) {
checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4,
cudaMemcpyDeviceToDevice));
} else {
checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4,
cudaMemcpyDeviceToDevice));
}
// update level variables
if (dlevels_left < 10) {
// approx_final = d_odata;
approx_is_input = 0;
}
// more global steps necessary
dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
dlevels_left -= 10;
// after each step only half the threads are used any longer
// therefore after 10 steps 2^10 less threads
num_threads_total_left = num_threads_total_left >> 10;
// update block and grid size
grid_size.x =
(num_threads_total_left / 512) + (0 != (num_threads_total_left % 512))
? 1
: 0;
if (grid_size.x <= 1) {
block_size.x = num_threads_total_left;
}
}
// get the result back from the server
// allocate mem for the result
float *odata = (float *)malloc(smem_size);
checkCudaErrors(
cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost));
// post processing
// write file for regression test
if (r_fname == NULL) {
fprintf(stderr,
"Cannot write the output file storing the result of the wavelet "
"decomposition.\n%s",
usage);
exit(EXIT_FAILURE);
}
if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) {
printf("Writing result to \"%s\"\n", r_fname);
} else {
exit(EXIT_FAILURE);
}
// load the reference solution
unsigned int len_reference = 0;
float *reference = NULL;
if (r_gold_fname == NULL) {
fprintf(stderr,
"Cannot read the file containing the reference result of the "
"wavelet decomposition.\n%s",
usage);
exit(EXIT_FAILURE);
}
if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) {
printf("Reading reference result from \"%s\"\n", r_gold_fname);
} else {
exit(EXIT_FAILURE);
}
assert(slength == len_reference);
// compare the computed solution and the reference
bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f);
free(reference);
// free allocated host and device memory
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(approx_final));
free(signal);
free(odata);
free(s_fname);
free(r_gold_fname);
printf(bResult ? "Test success!\n" : "Test failure!\n");
}
////////////////////////////////////////////////////////////////////////////////
//! Get number of decomposition levels to perform a full decomposition
//! Also check if the input signal size is suitable
//! @return true if the number of decomposition levels could be determined
//! and the signal length is supported by the implementation,
//! otherwise false
//! @param len length of input signal
//! @param levels number of decomposition levels necessary to perform a full
//! decomposition
////////////////////////////////////////////////////////////////////////////////
bool getLevels(unsigned int len, unsigned int *levels) {
bool retval = false;
// currently signals up to a length of 2^20 supported
for (unsigned int i = 0; i < 20; ++i) {
if (len == (1 << i)) {
*levels = i;
retval = true;
break;
}
}
return retval;
}