-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbm-aes-encrypt-cuda-v3.2.cu
143 lines (114 loc) · 5.97 KB
/
bm-aes-encrypt-cuda-v3.2.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <string.h>
#include "bm-utils-cuda.h"
/*
For benchmarking
Optimization:
-v1 Constant Memory: S box
-v1 Shared Memory: IV and expanded key
-v1 Pinned Memory: plaintext and ciphertext
-v2 Coalesced Memory Access: In the previous code, each thread accessed a different block of the plaintext and ciphertext arrays. If the blocks are not contiguous in memory, this can slow down the program. This code rearranges the data so that the blocks accessed by threads in the same warp are contiguous in memory.
-v3 Divergence Avoidance:
-v3.1 aes_ctr_encrypt_kernel(): In the original function, the divergence is caused by the conditional statement if (blockId < numBlocks). This divergence can be avoided by ensuring that the number of threads is a multiple of the number of blocks, which means padding the data to a multiple of the block size.
-v3.2 mul_v2(): In this modified version, the if (b & 1) and if (high_bit) conditions are replaced with arithmetic operations. This ensures all threads in a warp take the same execution path, avoiding divergence.
*/
// AES S-box held in device constant memory (read-only, identical for every
// thread, so constant-cache broadcast applies). Populated from the host-side
// h_sbox table via cudaMemcpyToSymbol before the kernel launch.
__constant__ unsigned char d_sbox_v3_2[256];
// AES-CTR encryption kernel (benchmark variant v3.2).
//
// Each thread encrypts one or more 16-byte counter blocks and XORs the
// resulting keystream into the output. Expected launch: 1-D grid, 1-D blocks
// with blockDim.x >= 176 so the round-key staging loop below covers the whole
// expanded key (the host wrapper launches 256 threads per block).
//
//   plaintext/ciphertext - device buffers of dataSize bytes
//   expandedKey          - 176-byte AES-128 round-key schedule (device memory)
//   iv                   - 16-byte initial counter value (device memory)
//   numBlocks            - number of AES blocks, i.e. ceil(dataSize / 16)
//   dataSize             - total payload size in bytes (last block may be short)
__global__ void aes_ctr_encrypt_kernel_v3_2(unsigned char *plaintext, unsigned char *ciphertext, unsigned char *expandedKey, unsigned char *iv, int numBlocks, int dataSize) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;

    // Stage the IV and the round keys in shared memory once per thread block.
    __shared__ unsigned char shared_iv[AES_BLOCK_SIZE];
    __shared__ unsigned char shared_expandedKey[176];
    if (threadIdx.x < AES_BLOCK_SIZE) {
        shared_iv[threadIdx.x] = iv[threadIdx.x];
    }
    if (threadIdx.x < 176) {
        shared_expandedKey[threadIdx.x] = expandedKey[threadIdx.x];
    }
    // All threads must see the fully populated shared arrays before use.
    __syncthreads();

    unsigned char counter[AES_BLOCK_SIZE];
    unsigned char keystream[AES_BLOCK_SIZE];

    // Grid-stride loop: thread t handles AES blocks t, t+stride, t+2*stride, ...
    // This visits exactly the same blocks as an explicit blocks-per-thread loop.
    for (int blockId = tid; blockId < numBlocks; blockId += stride) {
        // Counter for this block = IV incremented by the block index.
        memcpy(counter, shared_iv, AES_BLOCK_SIZE);
        increment_counter(counter, blockId);

        // The final block may be shorter than a full AES block.
        const int remaining = dataSize - blockId * AES_BLOCK_SIZE;
        const int blockSize = remaining < AES_BLOCK_SIZE ? remaining : AES_BLOCK_SIZE;

        // Encrypt the counter to produce this block's keystream.
        aes_encrypt_block_v2(counter, keystream, shared_expandedKey, d_sbox_v3_2);

        // XOR keystream into the plaintext to produce the ciphertext.
        const int base = blockId * AES_BLOCK_SIZE;
        for (int i = 0; i < blockSize; ++i) {
            ciphertext[base + i] = plaintext[base + i] ^ keystream[i];
        }
    }
}
// Encrypts dataSize bytes of plaintext with AES-128 in CTR mode on the GPU.
//
//   plaintext  - host buffer of dataSize bytes
//   dataSize   - payload size in bytes
//   key        - 16-byte AES-128 key
//   iv         - 16-byte initial counter value
//   ciphertext - caller-allocated host buffer of at least dataSize bytes
//
// Returns {total elapsed microseconds (incl. transfers), kernel-only microseconds}.
std::pair<double, double> aes_encrypt_cuda_v3_2(unsigned char *plaintext,
                                                size_t dataSize,
                                                unsigned char *key,
                                                unsigned char *iv,
                                                unsigned char *ciphertext) {
  // Report CUDA API failures instead of silently ignoring them; a failed
  // malloc/copy/launch would otherwise produce garbage output and misleading
  // timings. We log rather than abort so the benchmark harness keeps running.
  auto check = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    }
  };

  auto start = std::chrono::high_resolution_clock::now();

  unsigned char *d_plaintext, *d_ciphertext, *d_iv;
  unsigned char *d_expandedKey;

  // Expand the 16-byte key into the 176-byte AES-128 round-key schedule on the host.
  unsigned char expandedKey[176];
  KeyExpansionHost(key, expandedKey);

  // One AES block per 16 input bytes, rounding up for a partial final block.
  size_t numBlocks = (dataSize + AES_BLOCK_SIZE - 1) / AES_BLOCK_SIZE;

  // One thread per AES block; 256 threads per block also guarantees the
  // kernel's shared-memory staging loops (16 and 176 threads) are covered.
  dim3 threadsPerBlock(256);
  dim3 blocksPerGrid((numBlocks + threadsPerBlock.x - 1) / threadsPerBlock.x);

  // Allocate device memory.
  check(cudaMalloc((void **)&d_iv, AES_BLOCK_SIZE * sizeof(unsigned char)),
        "cudaMalloc d_iv");
  check(cudaMalloc((void **)&d_expandedKey, 176), "cudaMalloc d_expandedKey");
  check(cudaMalloc((void **)&d_plaintext, dataSize * sizeof(unsigned char)),
        "cudaMalloc d_plaintext");
  check(cudaMalloc((void **)&d_ciphertext, dataSize * sizeof(unsigned char)),
        "cudaMalloc d_ciphertext");

  // Copy the S-box into device constant memory.
  check(cudaMemcpyToSymbol(d_sbox_v3_2, h_sbox, sizeof(h_sbox)),
        "cudaMemcpyToSymbol d_sbox_v3_2");

  // Copy inputs host -> device.
  check(cudaMemcpy(d_plaintext, plaintext, dataSize * sizeof(unsigned char),
                   cudaMemcpyHostToDevice),
        "cudaMemcpy d_plaintext");
  check(cudaMemcpy(d_iv, iv, AES_BLOCK_SIZE * sizeof(unsigned char),
                   cudaMemcpyHostToDevice),
        "cudaMemcpy d_iv");
  check(cudaMemcpy(d_expandedKey, expandedKey, 176, cudaMemcpyHostToDevice),
        "cudaMemcpy d_expandedKey");

  // Launch the AES-CTR encryption kernel and time it in isolation.
  auto kernel_start = std::chrono::high_resolution_clock::now();
  aes_ctr_encrypt_kernel_v3_2<<<blocksPerGrid, threadsPerBlock>>>(
      d_plaintext, d_ciphertext, d_expandedKey, d_iv, numBlocks, dataSize);
  // Launch-configuration errors surface via cudaGetLastError(), not the launch itself.
  check(cudaGetLastError(), "kernel launch");
  // Execution errors surface at the next synchronizing call.
  check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  auto kernel_stop = std::chrono::high_resolution_clock::now();

  // Copy the ciphertext device -> host.
  check(cudaMemcpy(ciphertext, d_ciphertext, dataSize * sizeof(unsigned char),
                   cudaMemcpyDeviceToHost),
        "cudaMemcpy ciphertext");

  auto stop = std::chrono::high_resolution_clock::now();

  // Release device memory.
  cudaFree(d_plaintext);
  cudaFree(d_ciphertext);
  cudaFree(d_iv);
  cudaFree(d_expandedKey);

  // Return the two elapsed times in microseconds.
  return std::make_pair(
      std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
          .count(),
      std::chrono::duration_cast<std::chrono::microseconds>(kernel_stop -
                                                            kernel_start)
          .count());
}