forked from pradyGn/CuCNN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFFL.cu
134 lines (117 loc) · 3.75 KB
/
FFL.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
const int N = 4;
// Fully connected (dense) layer forward pass: output = W * input + bias.
//   input   - device vector, length N
//   weights - device matrix, row-major N x N; row i holds the weights of output neuron i
//   bias    - device vector, length N (one bias per output neuron)
//   output  - device vector, length N
// Expected launch: <<<1, N>>> — one thread per output neuron.
// Fix: the original added bias[j] on every loop iteration, so every output
// received the sum of ALL biases (sum_j bias[j]) instead of its own bias[i].
__global__ void forward_propagation_fc(float* input, float* weights, float* bias, float* output) {
    int i = threadIdx.x;
    if (i >= N) return;          // guard against an over-sized launch
    float sum = bias[i];         // bias contributes exactly once per output
    for (int j = 0; j < N; j++) {
        sum += weights[i * N + j] * input[j];
    }
    output[i] = sum;
}
// NOTE(review): removed a corrupted duplicate fragment here. It opened a
// second main(), re-#defined N (shadowing the const above), and re-defined
// forward_propagation_fc *inside* main — a nested __global__ definition is
// illegal and the duplicate symbol would not link. The canonical kernel is
// defined above; the real main() follows below.
// Fill a matrix_M x matrix_N row-major matrix so that element (row, col) = row + col.
void initialize(float *matrix, int matrix_M, int matrix_N){
    for (int row = 0; row < matrix_M; ++row) {
        float *rowStart = matrix + (row * matrix_N);
        for (int col = 0; col < matrix_N; ++col) {
            rowStart[col] = (float)(row + col);
        }
    }
}
// Print a matrix_M x matrix_N row-major matrix to stdout, one row per line,
// two decimals per element, followed by a blank line.
// Fix: the original indexed with (i * matrix_M) + j — using the ROW count as
// the row stride. In row-major layout the stride is the column count
// (matrix_N); the old code mis-addressed every non-square matrix.
void check_matrix(float *matrix, int matrix_M, int matrix_N){
    for (int i = 0; i < matrix_M; i++){
        for (int j = 0; j < matrix_N; j++)
        {
            printf("%.2f", matrix[(i * matrix_N) + j]);
            printf(" ");
        }
        printf("\n");
    }
    printf("\n");
}
// Abort with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Drive one forward pass of the N-wide fully connected layer:
// host setup -> H2D copies -> single kernel launch -> D2H copy -> print.
// Fixes vs. the original:
//   * the identical kernel was launched twice (with a duplicated D2H copy) —
//     launched once now, same result;
//   * no CUDA API call or launch was error-checked — every call is checked;
//   * malloc results were used unchecked;
//   * stray trailing "return 0; }" tokens after main's closing brace removed.
int main(){
    // Host buffers.
    float* input   = (float*)malloc(N * sizeof(float));
    float* weights = (float*)malloc(N * N * sizeof(float));
    float* output  = (float*)malloc(N * sizeof(float));
    float* biases  = (float*)malloc(N * sizeof(float));
    if (!input || !weights || !output || !biases) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // input = [0..N-1]; all weights 0.5; biases and output zeroed.
    for (int i = 0; i < N; i++) {
        input[i] = (float)i;
        for (int j = 0; j < N; j++) {
            weights[i * N + j] = 0.5f;
        }
        output[i] = 0.0f;
        biases[i] = 0.0f;
    }

    // Device buffers.
    float *d_input, *d_weights, *d_output, *d_biases;
    checkCuda(cudaMalloc(&d_input,   N * sizeof(float)),     "cudaMalloc d_input");
    checkCuda(cudaMalloc(&d_weights, N * N * sizeof(float)), "cudaMalloc d_weights");
    checkCuda(cudaMalloc(&d_output,  N * sizeof(float)),     "cudaMalloc d_output");
    checkCuda(cudaMalloc(&d_biases,  N * sizeof(float)),     "cudaMalloc d_biases");

    // Copy inputs to the device.
    checkCuda(cudaMemcpy(d_input, input, N * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy d_input");
    checkCuda(cudaMemcpy(d_weights, weights, N * N * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy d_weights");
    checkCuda(cudaMemcpy(d_biases, biases, N * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy d_biases");

    // One block of N threads: one thread per output neuron. Launch ONCE.
    forward_propagation_fc<<<1, N>>>(d_input, d_weights, d_biases, d_output);
    checkCuda(cudaGetLastError(), "kernel launch");

    // Blocking copy back — also synchronizes with the kernel.
    checkCuda(cudaMemcpy(output, d_output, N * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy output");

    // Print the input vector (original output format preserved).
    for (int i = 0; i < N; i++) {
        printf("%f ", input[i]);
    }
    printf("\n\n\n");
    // Print the weight matrix.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", weights[i * N + j]);
        }
    }
    printf("\n\n\n");
    // Print the layer output.
    for (int i = 0; i < N; i++) {
        printf("%f ", output[i]);
    }

    // Release device memory.
    cudaFree(d_input);
    cudaFree(d_weights);
    cudaFree(d_output);
    cudaFree(d_biases);
    // Release host memory.
    free(input);
    free(weights);
    free(output);
    free(biases);
    return 0;
}