-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathassignment.cu
415 lines (350 loc) · 15.9 KB
/
assignment.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
// standard lib stuff
#include <stdio.h>
#include <iostream>
#include <math.h>
#include <algorithm>
#include <random>
// thrust library
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/tuple.h>
#include <thrust/generate.h>
#include <thrust/random.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <cstdlib>
// For random number generation
#include <curand.h>
#include <curand_kernel.h>
// Program global variables. Many will be overridden during execution.
const double MAX = 5.12;
const double MIN = -5.12;
const int SIZE_PARENT_POOL = 7;
// Default settings if not overriden at runtime.
int POPULATION_SIZE = 200;
int N_PARAMETERS = 10;
int BLOCKSIZE = 256;
int TOTALTHREADS = 1024;
/* wrapper function to check cuda calls
* ref: https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
*/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/* Generate uniform distribution
*
* Used by thrust transform functions to create large numbers of
* random numbers in a uniform distribution.
*/
struct prg
{
double a, b;
__host__ __device__
prg(double _a=0.f, double _b=1.f) : a(_a), b(_b) {
};
__host__ __device__
double operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
//thrust::default_random_engine rng( 5555555 );
thrust::uniform_real_distribution<double> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
/* Generate normal distribution
*
* Used by thrust transform functions to create large numbers of
* random numbers from a gaussian (normal) distribution.
*/
struct normal
{
double a, b;
__host__ __device__
normal(double _a=0.f, double _b=0.1f) : a(_a), b(_b) {
};
__host__ __device__
double operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::normal_distribution<double> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
/* Return the larger of the two elements in the tuple
*
* Used by thrust reduce functions to find the maximum element
* in an array.
*/
template <class T>
struct larger_tuple {
__device__ __host__
thrust::tuple<T,int> operator()(const thrust::tuple<T,int> &a, const thrust::tuple<T,int> &b)
{
if (a > b) return a;
else return b;
}
};
/* min_index returns the index of the smallest element in the array
*
* Similar to NumPy argmin() functionality, this returns the index of the
* smallest element in the input array.
*/
template <class T>
int min_index(thrust::device_vector<T>& vec) {
thrust::counting_iterator<int> begin(0); thrust::counting_iterator<int> end(vec.size());
thrust::tuple<T,int> init(vec[0],0);
thrust::tuple<T,int> largest;
largest = thrust::reduce(thrust::make_zip_iterator(thrust::make_tuple(vec.begin(), begin)), thrust::make_zip_iterator(thrust::make_tuple(vec.end(), end)),
init, larger_tuple<T>());
return thrust::get<1>(largest);
}
/* score evaluates the fitness of each member in the population
*
* Each member (represented by a N_PARAMETERS sized section of the population
* array) is evaulated against the desired input function. For this example,
* this is hard-coded to be the "offset sphere problem".
*
* Higher numbers represent better fitness.
*/
__global__ void score(unsigned int n, unsigned int np, double *source, double *score) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
double value;
if (index < n) {
for (int i=index; i < n; i += stride) {
// score = 1/ (sqrt(sum((xi - 0.5)**2)) + 1)
value = 0;
for (int p=0; p<np; p++) {
value += std::pow(source[i*np+p]-0.5, 2);
}
value = (double) std::sqrt( (double) value);
score[i] = (double) 1.0 / (double) (value+1.0);
}
}
}
/* pickParents generates the set of parents to breed into the next generation
*
* The method here is a sort of limited tournament style. Each parent
* in the output array is the member with the best fitness drawn from a random
* pool of SIZE_PARENT_POOL that has been generated ahead of time.
*
* The brings in a controlled amount of natural selction, while still tending
* to bring the best members forward to the next generation. Note many parents
* can and will breed multiple times.
*/
__global__ void pickParents(unsigned int n, unsigned int np, int *randParents, double *score, int *pool) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Pick a parent n times
for (int i=index; i<n; i+=stride) {
double best = -1.0;
int best_index = -1;
int idx;
// Grab the SIZE_PARENT_POOL randomely chosen individuals,
// and pick the best one to output.
for (int j=0; j<SIZE_PARENT_POOL; j++) {
idx = randParents[i*SIZE_PARENT_POOL+j];
if (score[idx] > best) {
best = score[idx];
best_index = idx;
}
}
pool[i] = best_index;
}
}
/* breedGeneration uses the chosen parents to create a new derived generation
*
* 10% of the time, the parent will produce no children, but will move straight
* into the new generation.
*
* 90% of the time, a child will be produced from the two parents. In this
* case, half the values from one parent and half from the other are used
* to create the child. In this event, 5% of the time a random mutation
* will also happen to the child's values.
*/
__global__ void breedGeneration(unsigned int n, unsigned int np, int *randomParameters, double *population, double *newPopulation, int *parentsPool, double *mutations) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int parentA;
int parentB;
int probCopyParents;
int crossover;
int probChildAMutate;
int mutationPoint;
// Once for each individual
for (int i=index; i<n; i+=stride) {
probCopyParents = randomParameters[i*4];
crossover = randomParameters[(i+1)*4] % np;
probChildAMutate = randomParameters[(i+2)*4];
mutationPoint = randomParameters[(i+3)*4] % np;
parentA = parentsPool[i];
parentB = parentsPool[i+1];
// Place parent directly into the subsequent generation and
if (probCopyParents < 10) {
for (int j=0; j<np; j++) {
newPopulation[(i*np) + j] = population[(parentA*np) + j];
continue;
}
}
for (int j=0; j<np; j++) {
if (j < crossover) {
newPopulation[(i*np) + j] = population[(parentA*np) + j];
} else {
newPopulation[(i*np) + j] = population[(parentB*np) + j];
}
}
if (probChildAMutate < 5) {
double newval = newPopulation[(i*np) + mutationPoint] + mutations[i];
newPopulation[(i*np) + mutationPoint] = fminf(newval, MAX);
newPopulation[(i*np) + mutationPoint] = fmaxf(newval, MIN);
}
}
}
/* init sets up the random states
*/
__global__ void init(unsigned int seed, curandState_t* states) {
int idx = threadIdx.x+blockDim.x*blockIdx.x;
curand_init(seed, idx, 0, &states[idx]);
}
/* setRandom generates a random array modulo the max parameter.
*/
__global__ void setRandom(curandState_t* states, int* numbers, int max) {
int idx = threadIdx.x+blockDim.x*blockIdx.x;
for (int i=0; i<SIZE_PARENT_POOL; i++) {
numbers[idx+i] = curand(&states[idx]) % max;
}
}
int main(int argc, char** argv) {
int generation = 0;
// Create thrust arrays to hold the population, scores, and the new
// population that is created each generation.
thrust::device_vector<double> population(POPULATION_SIZE * N_PARAMETERS);
thrust::device_vector<double> popScores(POPULATION_SIZE);
thrust::device_vector<double> newPopulation(POPULATION_SIZE * N_PARAMETERS);
// Create raw pointers to the arrays createde by thrust. When using
// device calls outside the thrust library, this is the appropriate input
double* popPtr = thrust::raw_pointer_cast(&population[0]);
double* newPopPtr = thrust::raw_pointer_cast(&newPopulation[0]);
double* scoresPtr = thrust::raw_pointer_cast(&popScores[0]);
// Allocated random state for the initial population
curandState_t* states;
gpuErrchk(cudaMalloc((void**) &states, POPULATION_SIZE*SIZE_PARENT_POOL*sizeof(curandState_t)));
// Create random state for the random potential parents for each generation
int *randParents;
gpuErrchk(cudaMalloc((void**) &randParents, POPULATION_SIZE*SIZE_PARENT_POOL*sizeof(int)));
// Allocate array for the actual parents used to breed in each generation
int *parentsPool_d;
gpuErrchk(cudaMalloc((void**) &parentsPool_d, POPULATION_SIZE*sizeof(int)));
// Create random states and array for the parameters needed to breed children;
// - the probability of placing the parent directly into the next gen
// - the crossover point for the child
// - probability of mutating the child
// - index to mutate the child
curandState_t* childStates;
gpuErrchk(cudaMalloc((void**) &childStates, POPULATION_SIZE*4*sizeof(curandState_t)));
int *randParams_d;
gpuErrchk(cudaMalloc((void**)&randParams_d, POPULATION_SIZE*4*sizeof(int)));
// Parse Argument variables
if (argc >= 2) {
POPULATION_SIZE = atoi(argv[1]);
}
if (argc >= 3) {
N_PARAMETERS = atoi(argv[2]);
}
if (argc >= 4) {
TOTALTHREADS = atoi(argv[3]);
}
if (argc >= 5) {
BLOCKSIZE = atoi(argv[4]);
}
// Cude Device Properties to see if the BLOCKSIZE can be handled
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (BLOCKSIZE > prop.maxThreadsPerBlock) {
std::cout << "Device only supports a BLOCKSIZE of " << prop.maxThreadsPerBlock <<" threads/block" << std::endl;
std::cout << "Try again with a smaller BLOCKSIZE" << std::endl;
return 1;
}
std::cout << "Running with " << TOTALTHREADS << " threads and a BLOCKSIZE of ";
std::cout << BLOCKSIZE << std::endl;
int numBlocks = TOTALTHREADS/BLOCKSIZE;
// validate command line arguments and re-size if needed
if (TOTALTHREADS % BLOCKSIZE != 0) {
++numBlocks;
TOTALTHREADS = numBlocks*BLOCKSIZE;
printf("Warning: Total thread count is not evenly divisible by the block size\n");
printf("The total number of threads will be rounded up to %d\n", TOTALTHREADS);
}
std::cout << "Running " << numBlocks << " total blocks." << std::endl;
// initialize random generator for the initial population
std::default_random_engine generator;
std::normal_distribution<double> distribution(0, .1);
// Generate initial random population representing generation 0
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + POPULATION_SIZE * N_PARAMETERS,
population.begin(),
prg(MIN, MAX));
// Evaluate every member of the population and find the most fit individual
score<<<TOTALTHREADS, BLOCKSIZE>>>(POPULATION_SIZE, N_PARAMETERS, popPtr, scoresPtr);
double best = *(thrust::max_element(popScores.begin(), popScores.end()));
int best_index = min_index(popScores);
std::cout << "Initial generation best score: " << best << " at index: " << best_index << " ";
for (int i=0; i<N_PARAMETERS; i++) {
std::cout << population[best_index * N_PARAMETERS + i] << " ";
}
std::cout << std::endl;
// Create successive generations until convergence is achieved.
while (best < .999) {
// Create a new set of random parents to breed into the next generation
init<<<POPULATION_SIZE*SIZE_PARENT_POOL, 1>>>(time(0), states);
gpuErrchk(cudaPeekAtLastError());
setRandom<<<POPULATION_SIZE*SIZE_PARENT_POOL, 1>>>(states, randParents, POPULATION_SIZE);
gpuErrchk(cudaPeekAtLastError());
pickParents<<<TOTALTHREADS, BLOCKSIZE>>>(POPULATION_SIZE, N_PARAMETERS, randParents, scoresPtr, parentsPool_d);
gpuErrchk(cudaPeekAtLastError());
// Generate new random states for the breeding parameters
init<<<POPULATION_SIZE*4, 1>>>(time(0), childStates);
gpuErrchk(cudaPeekAtLastError());
setRandom<<<POPULATION_SIZE*4, 1>>>(childStates, randParams_d, 100);
gpuErrchk(cudaPeekAtLastError());
// Generate new random parameters for breeding and child mutation
thrust::device_vector<double> mutations(POPULATION_SIZE);
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + POPULATION_SIZE,
mutations.begin(),
normal(0.0, 0.001));
double* mutPtr = thrust::raw_pointer_cast(&mutations[0]);
// Breed members and copy over to the new generation
breedGeneration<<<TOTALTHREADS, BLOCKSIZE>>>(POPULATION_SIZE, N_PARAMETERS, randParams_d, popPtr, newPopPtr, parentsPool_d, mutPtr);
gpuErrchk(cudaPeekAtLastError());
thrust::copy(thrust::device, newPopulation.begin(), newPopulation.end(), population.begin());
// Evaluate all members and identify the most fit individual
score<<<TOTALTHREADS, BLOCKSIZE>>>(POPULATION_SIZE, N_PARAMETERS, popPtr, scoresPtr);
gpuErrchk(cudaPeekAtLastError());
best = *(thrust::min_element(popScores.begin(), popScores.end()));
best_index = min_index(popScores);
std::cout << "Bred generation " << generation << " Best score: " << best << " at index: " << best_index << " ";
for (int i=0; i<N_PARAMETERS; i++) {
std::cout << population[best_index * N_PARAMETERS + i] << " ";
}
std::cout << std::endl;
// Keep track of how many generations have occured.
generation++;
}
cudaFree(states);
cudaFree(randParents);
cudaFree(parentsPool_d);
cudaFree(childStates);
cudaFree(randParams_d);
return 0;
}