forked from karpathy/llm.c
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofile_gpt2.cu
101 lines (82 loc) · 3.75 KB
/
profile_gpt2.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
This code is a convenience tool for profiling the CUDA kernels in the training
loop of train_gpt2.cu. Compile:
make profile_gpt2cu NO_MULTI_GPU=1
And then e.g. use ncu from NVIDIA. The CLI docs for example:
https://docs.nvidia.com/nsight-compute/NsightComputeCli/
TLDR run like:
sudo ncu --set full --import-source yes -o profile -f ./profile_gpt2cu
This:
- `--set full` means we'll collect A LOT of metrics. take out for less
- `--import-source yes` means we'll get the source code in the profile
- `-o profile` writes the results into file profile.ncu-rep
- `-f` forces overwrite of the profile.ncu-rep file
- `./profile_gpt2cu` is the executable we want to profile
This writes results into profile.ncu-rep output file.
You can open this up in NVIDIA Nsight Compute UI.
For example, I have NVIDIA Nsight Compute installed on my Mac, and I rsync
the profile.ncu-rep from a cloud box to local to pretty view.
*/
#define TESTING
#include "train_gpt2.cu"
int main() {
// set up the device
int deviceIdx = 0;
cudaCheck(cudaSetDevice(deviceIdx));
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceIdx);
printf("[System]\n");
printf("Device %d: %s\n", deviceIdx, deviceProp.name);
cuda_num_SMs = deviceProp.multiProcessorCount;
cuda_threads_per_SM = deviceProp.maxThreadsPerMultiProcessor;
cuda_arch_major = deviceProp.major;
cuda_arch_minor = deviceProp.minor;
cudaCheck(cudaStreamCreate(&main_stream));
cudaEventCreateWithFlags(&main_event, cudaEventDisableTiming);
cudaEventCreateWithFlags(&loss_event, cudaEventDisableTiming);
for (int i = 0; i < num_parallel_streams; i++) {
cudaCheck(cudaStreamCreate(¶llel_streams[i]));
cudaEventCreateWithFlags(¶llel_events[i], cudaEventDisableTiming);
}
// setup cuBLAS and cuBLASLt
cublasCheck(cublasCreate(&cublas_handle));
cublasCheck(cublasSetStream(cublas_handle, main_stream));
cublasCheck(cublasLtCreate(&cublaslt_handle));
// TF32 precision is equivalent to torch.set_float32_matmul_precision('high')
int enable_tf32 = deviceProp.major >= 8 ? 1 : 0;
printf("enable_tf32: %d\n", enable_tf32);
cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F;
cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode));
// setup the (global) cuBLASLt workspace
cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size));
create_cudnn();
// build the GPT-2 model from a checkpoint
GPT2 model;
gpt2_build_from_checkpoint(&model, "gpt2_124M_bf16.bin");
int B = 24; // if program OOMs decrease this number, e.g. all the way down to 4 or etc
int T = 1024; // if even that OOMs move on to this one. keep them nice and powers of 2
printf("batch size: %d\n", B);
printf("sequence length: %d\n", T);
int* x = (int*)mallocCheck(B * T * sizeof(int));
int* y = (int*)mallocCheck(B * T * sizeof(int));
for(int i = 0; i < B * T; ++i) {
x[i] = i % model.config.vocab_size;
y[i] = i % model.config.vocab_size;
}
// override number of layers to 1 because all layers repeat the same kernels, only profile once
model.config.num_layers = 1;
// do a training step
gpt2_forward(&model, x, y, B, T);
gpt2_zero_grad(&model);
gpt2_backward(&model);
gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1);
cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings
// free
gpt2_free(&model);
destroy_cudnn();
cudaCheck(cudaFree(cublaslt_workspace));
cublasCheck(cublasDestroy(cublas_handle));
cublasCheck(cublasLtDestroy(cublaslt_handle));
return 0;
}