From eaedc294a3c76a89c459893deba8b2627a11541e Mon Sep 17 00:00:00 2001
From: Viren Abhyankar
Date: Thu, 26 Jan 2023 16:17:48 -0800
Subject: [PATCH] Softmax kernels (#593)

---
 .../flexflow/ops/kernels/softmax_kernels.h | 52 +++++++++++
 include/flexflow/ops/softmax.h             | 39 --------
 src/ops/{ => kernels}/softmax.cpp          | 90 ++++++++++---------
 src/ops/{ => kernels}/softmax.cu           | 90 ++++++++++---------
 src/ops/softmax.cc                         |  7 +-
 5 files changed, 151 insertions(+), 127 deletions(-)
 create mode 100644 include/flexflow/ops/kernels/softmax_kernels.h
 rename src/ops/{ => kernels}/softmax.cpp (77%)
 rename src/ops/{ => kernels}/softmax.cu (77%)

diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h
new file mode 100644
index 0000000000..81b34d8558
--- /dev/null
+++ b/include/flexflow/ops/kernels/softmax_kernels.h
@@ -0,0 +1,52 @@
+#ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H
+#define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H
+
+#include "flexflow/device.h"
+#include "flexflow/fftype.h"
+#include "flexflow/op_meta.h"
+#include "flexflow/ops/softmax.h"
+
+namespace FlexFlow {
+
+class SoftmaxMeta : public OpMeta {
+public:
+  SoftmaxMeta(FFHandler handle,
+              Softmax const *softmax,
+              Legion::Domain const &input_domain);
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+  cudnnTensorDescriptor_t inputTensor;
+#else
+  miopenTensorDescriptor_t inputTensor;
+#endif
+  bool profiling;
+  int dim;
+  char op_name[MAX_OPNAME];
+};
+
+namespace Kernels {
+namespace Softmax {
+
+void forward_kernel_wrapper(SoftmaxMeta const *m,
+                            float const *input_ptr,
+                            float *output_ptr);
+
+void backward_kernel_wrapper(SoftmaxMeta const *m,
+                             float *input_grad_ptr,
+                             float const *output_grad_ptr,
+                             size_t num_elements);
+
+namespace Internal {
+void forward_kernel(SoftmaxMeta const *m,
+                    float const *input_ptr,
+                    float *output_ptr,
+                    ffStream_t stream);
+void backward_kernel(float *input_grad_ptr,
+                     float const *output_grad_ptr,
+                     size_t num_elements,
+                     ffStream_t stream);
+} // namespace Internal
+} // namespace Softmax
+} // namespace Kernels
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H
diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h
index c9d6a1e6a9..25a20315bd 100644
--- a/include/flexflow/ops/softmax.h
+++ b/include/flexflow/ops/softmax.h
@@ -1,33 +1,13 @@
 #ifndef _FLEXFLOW_SOFTMAX_H
 #define _FLEXFLOW_SOFTMAX_H
 
-#include "flexflow/device.h"
-#include "flexflow/fftype.h"
 #include "flexflow/layer.h"
 #include "flexflow/node.h"
-#include "flexflow/op_meta.h"
 #include "flexflow/operator.h"
 #include "flexflow/ops/softmax_params.h"
 
 namespace FlexFlow {
 
-class Softmax;
-
-class SoftmaxMeta : public OpMeta {
-public:
-  SoftmaxMeta(FFHandler handle,
-              Softmax const *softmax,
-              Legion::Domain const &input_domain);
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnTensorDescriptor_t inputTensor;
-#else
-  miopenTensorDescriptor_t inputTensor;
-#endif
-  bool profiling;
-  int dim;
-  char op_name[MAX_OPNAME];
-};
-
 class Softmax : public Op {
 public:
   using Params = SoftmaxParams;
@@ -63,28 +43,9 @@ class Softmax : public Op {
                             std::vector<Legion::PhysicalRegion> const &regions,
                             Legion::Context ctx,
                             Legion::Runtime *runtime);
-  void init_meta(SoftmaxMeta *m,
-                 Legion::Rect<2> const &input,
-                 Legion::Rect<2> const &output) const;
   bool measure_operator_cost(Simulator *sim,
                              MachineView const &pc,
                              CostMetrics &cost_metrics) const override;
-  static void forward_kernel(SoftmaxMeta const *m,
-                             float const *input_ptr,
-                             float *output_ptr,
-                             ffStream_t stream);
-  static void forward_kernel_wrapper(SoftmaxMeta const *m,
-                                     float const *input_ptr,
-                                     float *output_ptr);
-  static void backward_kernel(float *input_grad_ptr,
-                              float const *output_grad_ptr,
-                              size_t num_elements,
-                              ffStream_t stream);
-  static void backward_kernel_wrapper(SoftmaxMeta const *m,
-                                      float *input_grad_ptr,
-                                      float const *output_grad_ptr,
-                                      size_t num_elements);
-
   Params get_params() const;
 
 private:
diff --git a/src/ops/softmax.cpp b/src/ops/kernels/softmax.cpp
similarity index 77%
rename from src/ops/softmax.cpp
rename to src/ops/kernels/softmax.cpp
index e53b41f4a4..d63bd0edc5 100644
--- a/src/ops/softmax.cpp
+++ b/src/ops/kernels/softmax.cpp
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "flexflow/ops/softmax.h"
+#include "flexflow/ops/kernels/softmax_kernels.h"
 #include "flexflow/utils/hash_utils.h"
 #include "flexflow/utils/hip_helper.h"
 #include <hip/hip_runtime.h>
@@ -33,29 +33,12 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
   std::strcpy(op_name, softmax->name);
 }
 
-/* static */
-void Softmax::forward_kernel(SoftmaxMeta const *m,
-                             float const *input_ptr,
-                             float *output_ptr,
-                             hipStream_t stream) {
-  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+namespace Kernels {
+namespace Softmax {
 
-  float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn,
-                                     &alpha,
-                                     m->inputTensor,
-                                     input_ptr,
-                                     &beta,
-                                     m->inputTensor,
-                                     output_ptr,
-                                     MIOPEN_SOFTMAX_ACCURATE,
-                                     MIOPEN_SOFTMAX_MODE_CHANNEL));
-}
-
-/* static */
-void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
-                                     float const *input_ptr,
-                                     float *output_ptr) {
+void forward_kernel_wrapper(SoftmaxMeta const *m,
+                            float const *input_ptr,
+                            float *output_ptr) {
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -65,7 +48,7 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
     hipEventCreate(&t_end);
     hipEventRecord(t_start, stream);
   }
-  Softmax::forward_kernel(m, input_ptr, output_ptr, stream);
+  Internal::forward_kernel(m, input_ptr, output_ptr, stream);
   if (m->profiling) {
     hipEventRecord(t_end, stream);
     checkCUDA(hipEventSynchronize(t_end));
@@ -81,23 +64,10 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
   }
 }
 
-/* static */
-void Softmax::backward_kernel(float *input_grad_ptr,
-                              float const *output_grad_ptr,
-                              size_t num_elements,
-                              hipStream_t stream) {
-  checkCUDA(hipMemcpyAsync(input_grad_ptr,
-                           output_grad_ptr,
-                           num_elements * sizeof(float),
-                           hipMemcpyDeviceToDevice,
-                           stream));
-}
-
-/* static */
-void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
-                                      float *input_grad_ptr,
-                                      float const *output_grad_ptr,
-                                      size_t num_elements) {
+void backward_kernel_wrapper(SoftmaxMeta const *m,
+                             float *input_grad_ptr,
+                             float const *output_grad_ptr,
+                             size_t num_elements) {
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -107,7 +77,7 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
     hipEventCreate(&t_end);
     hipEventRecord(t_start, stream);
   }
-  Softmax::backward_kernel(
+  Internal::backward_kernel(
       input_grad_ptr, output_grad_ptr, num_elements, stream);
   if (m->profiling) {
     hipEventRecord(t_end, stream);
@@ -124,4 +94,38 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
   }
 }
 
-}; // namespace FlexFlow
+namespace Internal {
+
+void forward_kernel(SoftmaxMeta const *m,
+                    float const *input_ptr,
+                    float *output_ptr,
+                    hipStream_t stream) {
+  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+
+  float alpha = 1.0f, beta = 0.0f;
+  checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn,
+                                     &alpha,
+                                     m->inputTensor,
+                                     input_ptr,
+                                     &beta,
+                                     m->inputTensor,
+                                     output_ptr,
+                                     MIOPEN_SOFTMAX_ACCURATE,
+                                     MIOPEN_SOFTMAX_MODE_CHANNEL));
+}
+
+void backward_kernel(float *input_grad_ptr,
+                     float const *output_grad_ptr,
+                     size_t num_elements,
+                     hipStream_t stream) {
+  checkCUDA(hipMemcpyAsync(input_grad_ptr,
+                           output_grad_ptr,
+                           num_elements * sizeof(float),
+                           hipMemcpyDeviceToDevice,
+                           stream));
+}
+
+} // namespace Internal
+} // namespace Softmax
+} // namespace Kernels
+} // namespace FlexFlow
diff --git a/src/ops/softmax.cu b/src/ops/kernels/softmax.cu
similarity index 77%
rename from src/ops/softmax.cu
rename to src/ops/kernels/softmax.cu
index 7114f06274..d83d9952c9 100644
--- a/src/ops/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "flexflow/ops/softmax.h"
+#include "flexflow/ops/kernels/softmax_kernels.h"
 #include "flexflow/utils/cuda_helper.h"
 #include "flexflow/utils/hash_utils.h"
 
@@ -32,29 +32,12 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
   std::strcpy(op_name, softmax->name);
 }
 
-/* static */
-void Softmax::forward_kernel(SoftmaxMeta const *m,
-                             float const *input_ptr,
-                             float *output_ptr,
-                             cudaStream_t stream) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+namespace Kernels {
+namespace Softmax {
 
-  float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
-                                 CUDNN_SOFTMAX_ACCURATE,
-                                 CUDNN_SOFTMAX_MODE_CHANNEL,
-                                 &alpha,
-                                 m->inputTensor,
-                                 input_ptr,
-                                 &beta,
-                                 m->inputTensor,
-                                 output_ptr));
-}
-
-/* static */
-void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
-                                     float const *input_ptr,
-                                     float *output_ptr) {
+void forward_kernel_wrapper(SoftmaxMeta const *m,
+                            float const *input_ptr,
+                            float *output_ptr) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -64,7 +47,7 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
    cudaEventCreate(&t_end);
    cudaEventRecord(t_start, stream);
  }
-  Softmax::forward_kernel(m, input_ptr, output_ptr, stream);
+  Internal::forward_kernel(m, input_ptr, output_ptr, stream);
   if (m->profiling) {
     cudaEventRecord(t_end, stream);
     checkCUDA(cudaEventSynchronize(t_end));
@@ -80,23 +63,10 @@ void Softmax::forward_kernel_wrapper(SoftmaxMeta const *m,
   }
 }
 
-/* static */
-void Softmax::backward_kernel(float *input_grad_ptr,
-                              float const *output_grad_ptr,
-                              size_t num_elements,
-                              cudaStream_t stream) {
-  checkCUDA(cudaMemcpyAsync(input_grad_ptr,
-                            output_grad_ptr,
-                            num_elements * sizeof(float),
-                            cudaMemcpyDeviceToDevice,
-                            stream));
-}
-
-/* static */
-void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
-                                      float *input_grad_ptr,
-                                      float const *output_grad_ptr,
-                                      size_t num_elements) {
+void backward_kernel_wrapper(SoftmaxMeta const *m,
+                             float *input_grad_ptr,
+                             float const *output_grad_ptr,
+                             size_t num_elements) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -106,7 +76,7 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
     cudaEventCreate(&t_end);
     cudaEventRecord(t_start, stream);
   }
-  Softmax::backward_kernel(
+  Internal::backward_kernel(
      input_grad_ptr, output_grad_ptr, num_elements, stream);
   if (m->profiling) {
     cudaEventRecord(t_end, stream);
@@ -123,4 +93,38 @@ void Softmax::backward_kernel_wrapper(SoftmaxMeta const *m,
   }
 }
 
-}; // namespace FlexFlow
+namespace Internal {
+
+void forward_kernel(SoftmaxMeta const *m,
+                    float const *input_ptr,
+                    float *output_ptr,
+                    cudaStream_t stream) {
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  float alpha = 1.0f, beta = 0.0f;
+  checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                 CUDNN_SOFTMAX_ACCURATE,
+                                 CUDNN_SOFTMAX_MODE_CHANNEL,
+                                 &alpha,
+                                 m->inputTensor,
+                                 input_ptr,
+                                 &beta,
+                                 m->inputTensor,
+                                 output_ptr));
+}
+
+void backward_kernel(float *input_grad_ptr,
+                     float const *output_grad_ptr,
+                     size_t num_elements,
+                     cudaStream_t stream) {
+  checkCUDA(cudaMemcpyAsync(input_grad_ptr,
+                            output_grad_ptr,
+                            num_elements * sizeof(float),
+                            cudaMemcpyDeviceToDevice,
+                            stream));
+}
+
+} // namespace Internal
+} // namespace Softmax
+} // namespace Kernels
+} // namespace FlexFlow
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 813104292f..029b20afd1 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -15,6 +15,7 @@
 
 #include "flexflow/ops/softmax.h"
 #include "flexflow/model.h"
+#include "flexflow/ops/kernels/softmax_kernels.h"
 #include "flexflow/utils/hash_utils.h"
 
 namespace FlexFlow {
@@ -34,6 +35,8 @@ using Legion::Task;
 using Legion::TaskArgument;
 using Legion::TaskLauncher;
 
+using namespace FlexFlow::Kernels::Softmax;
+
 /* Params */
 bool operator==(SoftmaxParams const &lhs, SoftmaxParams const &rhs) {
   return lhs.dim == rhs.dim;
 }
@@ -252,7 +255,7 @@ void Softmax::forward_task_with_dim(Task const *task,
                                           runtime,
                                           false /*readOutput*/);
 
-  Softmax::forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr);
+  forward_kernel_wrapper(m, acc_input.ptr, acc_output.ptr);
 }
 
 void Softmax::backward(FFModel const &ff) {
@@ -327,7 +330,7 @@ void Softmax::backward_task_with_dim(Task const *task,
 
   // make sure the image indices match!
   assert(acc_input_grad.rect == acc_output_grad.rect);
 
-  Softmax::backward_kernel_wrapper(
+  backward_kernel_wrapper(
      m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume());
 }
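
Note (illustrative, not part of the patch): after this change, code outside the Softmax operator reaches the kernels through the FlexFlow::Kernels::Softmax wrappers instead of static Softmax methods. Below is a minimal C++ sketch of a caller; the helper name run_softmax_example and its pointer/size arguments are hypothetical stand-ins for what a real Legion task would obtain from its region accessors.

#include "flexflow/ops/kernels/softmax_kernels.h"

using namespace FlexFlow;
using namespace FlexFlow::Kernels::Softmax;

// Hypothetical helper: the SoftmaxMeta, device pointers, and element count
// would normally come from the Legion task arguments and region accessors.
void run_softmax_example(SoftmaxMeta const *m,
                         float const *input_ptr,
                         float *output_ptr,
                         float *input_grad_ptr,
                         float const *output_grad_ptr,
                         size_t num_elements) {
  // Forward: the wrapper fetches the Legion stream itself and, when
  // m->profiling is set, times the cuDNN/MIOpen softmax launch.
  forward_kernel_wrapper(m, input_ptr, output_ptr);

  // Backward: per the patch, the output gradient is copied device-to-device
  // into the input gradient.
  backward_kernel_wrapper(m, input_grad_ptr, output_grad_ptr, num_elements);
}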