From 40c38a6bedadd41dc3c043e89237842df137b252 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jul 2024 00:57:21 -0700
Subject: [PATCH] Automated Code Change

PiperOrigin-RevId: 657898281
---
 .../internal/optimized/legacy_optimized_ops.h | 1052 +++++++++--------
 .../internal/optimized/resize_bilinear.h      |  169 +--
 2 files changed, 623 insertions(+), 598 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
index eb83c9b54bbd97..8c8c7288143055 100644
--- a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -213,14 +213,14 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 template <DepthwiseConvOutputRounding kOutputRounding>
 inline void LegacyDepthwiseConvWithRounding(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data, int thread_start, int thread_end, int thread_dim) {
   ruy::profiler::ScopeLabel label("DepthwiseConv/8bit");
   const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   const int dilation_width_factor = params.dilation_width_factor;
   const int dilation_height_factor = params.dilation_height_factor;
   TFLITE_DCHECK_GE(dilation_width_factor, 1);
@@ -267,10 +267,10 @@ inline void LegacyDepthwiseConvWithRounding(
 
 inline void LegacyDepthwiseConvImpl(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, int thread_start, int thread_end, int thread_dim) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data, int thread_start, int thread_end, int thread_dim) {
   return LegacyDepthwiseConvWithRounding<
       DepthwiseConvOutputRounding::kAwayFromZero>(
       params, input_shape, input_data, filter_shape, filter_data, bias_shape,
@@ -278,16 +278,16 @@ inline void LegacyDepthwiseConvImpl(
       thread_dim);
 }
 
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
+inline void DepthwiseConv(const uint8_t* input_data, const Dims<4>& input_dims,
+                          int32_t input_offset, const uint8_t* filter_data,
+                          const Dims<4>& filter_dims, int32_t filter_offset,
+                          const int32_t* bias_data, const Dims<4>& bias_dims,
                           int stride_width, int stride_height,
                           int dilation_width_factor, int dilation_height_factor,
                           int pad_width, int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
+                          int32_t output_offset, int32_t output_multiplier,
+                          int output_shift, int32_t output_activation_min,
+                          int32_t output_activation_max, uint8_t* output_data,
                           const Dims<4>& output_dims) {
   tflite::DepthwiseParams op_params;
   // Padding type is ignored, but still set.
@@ -318,15 +318,15 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
                            /*thread_end=*/output_height, /*thread_dim=*/1);
 }
 
-inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                          int32 input_offset, const uint8* filter_data,
-                          const Dims<4>& filter_dims, int32 filter_offset,
-                          const int32* bias_data, const Dims<4>& bias_dims,
+inline void DepthwiseConv(const uint8_t* input_data, const Dims<4>& input_dims,
+                          int32_t input_offset, const uint8_t* filter_data,
+                          const Dims<4>& filter_dims, int32_t filter_offset,
+                          const int32_t* bias_data, const Dims<4>& bias_dims,
                           int stride_width, int stride_height, int pad_width,
                           int pad_height, int depth_multiplier,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_shift, int32 output_activation_min,
-                          int32 output_activation_max, uint8* output_data,
+                          int32_t output_offset, int32_t output_multiplier,
+                          int output_shift, int32_t output_activation_min,
+                          int32_t output_activation_max, uint8_t* output_data,
                           const Dims<4>& output_dims) {
   DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
                 filter_offset, bias_data, bias_dims, stride_width,
@@ -338,15 +338,15 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 
 // Legacy, for compatibility with old checked-in code.
 template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims,
+void DepthwiseConv(const uint8_t* input_data, const Dims<4>& input_dims,
+                   int32_t input_offset, const uint8_t* filter_data,
+                   const Dims<4>& filter_dims, int32_t filter_offset,
+                   const int32_t* bias_data, const Dims<4>& bias_dims,
                    int stride_width, int stride_height, int pad_width,
-                   int pad_height, int depth_multiplier, int32 output_offset,
-                   int32 output_multiplier, int output_shift,
-                   int32 output_activation_min, int32 output_activation_max,
-                   uint8* output_data, const Dims<4>& output_dims) {
+                   int pad_height, int depth_multiplier, int32_t output_offset,
+                   int32_t output_multiplier, int output_shift,
+                   int32_t output_activation_min, int32_t output_activation_max,
+                   uint8_t* output_data, const Dims<4>& output_dims) {
   if (Ac == FusedActivationFunctionType::kNone) {
     TFLITE_DCHECK_EQ(output_activation_min, 0);
     TFLITE_DCHECK_EQ(output_activation_max, 255);
@@ -361,15 +361,15 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 
 // Legacy, for compatibility with old checked-in code.
 template <FusedActivationFunctionType Ac>
-void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
-                   int32 input_offset, const uint8* filter_data,
-                   const Dims<4>& filter_dims, int32 filter_offset,
-                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
-                   int pad_width, int pad_height, int depth_multiplier,
-                   int32 output_offset, int32 output_multiplier,
-                   int output_shift, int32 output_activation_min,
-                   int32 output_activation_max, uint8* output_data,
-                   const Dims<4>& output_dims) {
+void DepthwiseConv(const uint8_t* input_data, const Dims<4>& input_dims,
+                   int32_t input_offset, const uint8_t* filter_data,
+                   const Dims<4>& filter_dims, int32_t filter_offset,
+                   const int32_t* bias_data, const Dims<4>& bias_dims,
+                   int stride, int pad_width, int pad_height,
+                   int depth_multiplier, int32_t output_offset,
+                   int32_t output_multiplier, int output_shift,
+                   int32_t output_activation_min, int32_t output_activation_max,
+                   uint8_t* output_data, const Dims<4>& output_dims) {
   DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
                     filter_dims, filter_offset, bias_data, bias_dims, stride,
                     stride, pad_width, pad_height, depth_multiplier,
@@ -437,10 +437,10 @@ inline int HowManyConvThreads(const RuntimeShape& output_shape,
 
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data, gemmlowp::GemmContext* gemmlowp_context = nullptr) {
   ruy::profiler::ScopeLabel label("DepthwiseConv");
 
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -477,7 +477,7 @@ inline void DepthwiseConv(
   for (int i = 0; i < thread_count; ++i) {
     int thread_end =
         thread_start + (thread_dim_size - thread_start) / (thread_count - i);
-    tasks[i] = new LegacyDepthwiseConvWorkerTask<uint8, int32>(
+    tasks[i] = new LegacyDepthwiseConvWorkerTask<uint8_t, int32_t>(
         params, input_shape, input_data, filter_shape, filter_data,
         bias_shape, bias_data, output_shape, output_data, thread_start,
         thread_end, thread_dim);
@@ -490,8 +490,8 @@ inline void DepthwiseConv(
 
 template <typename T, typename TS>
 struct LegacyPerChannelDepthwiseConvWorkerTask : public gemmlowp::Task {
   LegacyPerChannelDepthwiseConvWorkerTask(
-      const DepthwiseParams& params, const int32* output_multiplier,
-      const int32* output_shift, const RuntimeShape& input_shape,
+      const DepthwiseParams& params, const int32_t* output_multiplier,
+      const int32_t* output_shift, const RuntimeShape& input_shape,
       const T* input_data, const RuntimeShape& filter_shape,
       const T* filter_data, const RuntimeShape& bias_shape, const TS* bias_data,
       const RuntimeShape& output_shape, T* output_data, int thread_start,
@@ -521,8 +521,8 @@ struct LegacyPerChannelDepthwiseConvWorkerTask : public gemmlowp::Task {
 
  private:
   const DepthwiseParams& params_;
-  const int32* output_multiplier_;
-  const int32* output_shift_;
+  const int32_t* output_multiplier_;
+  const int32_t* output_shift_;
   const RuntimeShape& input_shape_;
   const T* input_data_;
   const RuntimeShape& filter_shape_;
@@ -537,12 +537,12 @@ struct LegacyPerChannelDepthwiseConvWorkerTask : public gemmlowp::Task {
 };
 
 inline void DepthwiseConvPerChannel(
-    const DepthwiseParams& params, const int32* output_multiplier,
-    const int32* output_shift, const RuntimeShape& input_shape,
-    const int8* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
-    gemmlowp::GemmContext* gemmlowp_context = nullptr) {
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, gemmlowp::GemmContext* gemmlowp_context = nullptr) {
   ruy::profiler::ScopeLabel label("DepthwiseConvInt8");
 
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -581,7 +581,7 @@ inline void DepthwiseConvPerChannel(
   for (int i = 0; i < thread_count; ++i) {
     int thread_end =
         thread_start + (thread_dim_size - thread_start) / (thread_count - i);
-    tasks[i] = new LegacyPerChannelDepthwiseConvWorkerTask<int8, int32>(
+    tasks[i] = new LegacyPerChannelDepthwiseConvWorkerTask<int8_t, int32_t>(
        params, output_multiplier, output_shift, input_shape, input_data,
        filter_shape, filter_data, bias_shape, bias_data, output_shape,
        output_data, thread_start, thread_end, thread_dim);
@@ -713,17 +713,17 @@ void FullyConnected(const float* input_data, const Dims<4>& input_dims,
 }
 
 struct GemmlowpOutputPipeline {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
       ColVectorMap;
   typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
                      gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
                      gemmlowp::OutputStageClamp,
                      gemmlowp::OutputStageSaturatingCastToUint8>
      Pipeline;
-  static Pipeline MakeExp(const int32* bias_data, int output_rows,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_left_shift, int32 output_activation_min,
-                          int32 output_activation_max) {
+  static Pipeline MakeExp(const int32_t* bias_data, int output_rows,
+                          int32_t output_offset, int32_t output_multiplier,
+                          int output_left_shift, int32_t output_activation_min,
+                          int32_t output_activation_max) {
     ColVectorMap bias_vector(bias_data, output_rows);
     gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
@@ -741,17 +741,17 @@ struct GemmlowpOutputPipeline {
 };
 
 struct GemmlowpOutputPipelineInt8 {
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
      ColVectorMap;
   typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
                      gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
                      gemmlowp::OutputStageClamp,
                      gemmlowp::OutputStageSaturatingCastToInt8>
      Pipeline;
-  static Pipeline MakeExp(const int32* bias_data, int output_rows,
-                          int32 output_offset, int32 output_multiplier,
-                          int output_left_shift, int32 output_activation_min,
-                          int32 output_activation_max) {
+  static Pipeline MakeExp(const int32_t* bias_data, int output_rows,
+                          int32_t output_offset, int32_t output_multiplier,
+                          int output_left_shift, int32_t output_activation_min,
+                          int32_t output_activation_max) {
    ColVectorMap bias_vector(bias_data, output_rows);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
@@ -770,13 +770,14 @@ struct GemmlowpOutputPipelineInt8 {
 
 #ifdef USE_NEON
 inline void LegacyFullyConnectedAsGEMVWorkerImpl(
-    const RuntimeShape& input_shape, const uint8* input_data,
-    int32 input_offset, const RuntimeShape& filter_shape,
-    const uint8* filter_data, int32 filter_offset,
-    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data, int row_start, int row_end) {
+    const RuntimeShape& input_shape, const uint8_t* input_data,
+    int32_t input_offset, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, int32_t filter_offset,
+    const RuntimeShape& bias_shape, const int32_t* bias_data,
+    int32_t output_offset, int32_t output_multiplier, int output_shift,
+    int32_t output_activation_min, int32_t output_activation_max,
+    const RuntimeShape& output_shape, uint8_t* output_data, int row_start,
+    int row_end) {
   ruy::profiler::ScopeLabel label("FullyConnectedAsGEMV/8bit");
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
@@ -806,7 +807,7 @@ inline void LegacyFullyConnectedAsGEMVWorkerImpl(
     int in = 0;
     for (; in <= input_size - 16; in += 16) {
       const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      const uint8* filter_ptr = filter_data + in + out * input_size;
+      const uint8_t* filter_ptr = filter_data + in + out * input_size;
       uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr);
       optimized_ops_preload_l1_stream(filter_ptr + 64);
       filter_ptr += input_size;
@@ -884,7 +885,7 @@ inline void LegacyFullyConnectedAsGEMVWorkerImpl(
     }
     for (; in <= input_size - 8; in += 8) {
       const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      const uint8* filter_ptr = filter_data + in + out * input_size;
+      const uint8_t* filter_ptr = filter_data + in + out * input_size;
       uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr);
       filter_ptr += input_size;
       uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr);
@@ -920,16 +921,16 @@ inline void LegacyFullyConnectedAsGEMVWorkerImpl(
           vget_high_s16(input_val));
     }
     if (in < input_size) {
-      int32 buf[16];
+      int32_t buf[16];
       vst1q_s32(buf + 0, acc0);
       vst1q_s32(buf + 4, acc1);
       vst1q_s32(buf + 8, acc2);
       vst1q_s32(buf + 12, acc3);
       for (; in < input_size; in++) {
         int lane = (in + 8 - input_size) % 4;
-        const int32 input_val = input_data[in] + input_offset;
+        const int32_t input_val = input_data[in] + input_offset;
         for (int k = 0; k < kPeel; k++) {
-          int32 filter_val =
+          int32_t filter_val =
               filter_data[in + (out + k) * input_size] + filter_offset;
           buf[lane + 4 * k] += filter_val * input_val;
         }
@@ -958,7 +959,7 @@ inline void LegacyFullyConnectedAsGEMVWorkerImpl(
     int32x4_t bias_vec = vld1q_s32(bias_data + out);
     reduced = vaddq_s32(reduced, bias_vec);
     if (shift_left) {
-      const int32 multiplier_power_of_two = 1 << output_shift;
+      const int32_t multiplier_power_of_two = 1 << output_shift;
       reduced = vmulq_n_s32(reduced, multiplier_power_of_two);
       reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
     } else {
@@ -988,13 +989,13 @@ inline void LegacyFullyConnectedAsGEMVWorkerImpl(
 
 struct LegacyFullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
   LegacyFullyConnectedAsGEMVWorkerTask(
-      const RuntimeShape& input_shape, const uint8* input_data,
-      int32 input_offset, const RuntimeShape& filter_shape,
-      const uint8* filter_data, int32 filter_offset,
-      const RuntimeShape& bias_shape, const int32* bias_data,
-      int32 output_offset, int32 output_multiplier, int output_shift,
-      int32 output_activation_min, int32 output_activation_max,
-      const RuntimeShape& output_shape, uint8* output_data, int row_start,
+      const RuntimeShape& input_shape, const uint8_t* input_data,
+      int32_t input_offset, const RuntimeShape& filter_shape,
+      const uint8_t* filter_data, int32_t filter_offset,
+      const RuntimeShape& bias_shape, const int32_t* bias_data,
+      int32_t output_offset, int32_t output_multiplier, int output_shift,
+      int32_t output_activation_min, int32_t output_activation_max,
+      const RuntimeShape& output_shape, uint8_t* output_data, int row_start,
       int row_end)
       : input_shape_(input_shape),
         input_data_(input_data),
@@ -1024,32 +1025,33 @@ struct LegacyFullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
   }
 
   const RuntimeShape& input_shape_;
-  const uint8* input_data_;
-  int32 input_offset_;
+  const uint8_t* input_data_;
+  int32_t input_offset_;
   const RuntimeShape& filter_shape_;
-  const uint8* filter_data_;
-  int32 filter_offset_;
+  const uint8_t* filter_data_;
+  int32_t filter_offset_;
   const RuntimeShape& bias_shape_;
-  const int32* bias_data_;
-  int32 output_offset_;
-  int32 output_multiplier_;
+  const int32_t* bias_data_;
+  int32_t output_offset_;
+  int32_t output_multiplier_;
   int output_shift_;
-  int32 output_activation_min_;
-  int32 output_activation_max_;
+  int32_t output_activation_min_;
+  int32_t output_activation_max_;
   const RuntimeShape& output_shape_;
-  uint8* output_data_;
+  uint8_t* output_data_;
   int row_start_;
   int row_end_;
 };
 
 inline void FullyConnectedAsGEMV(
-    const RuntimeShape& input_shape, const uint8* input_data,
-    int32 input_offset, const RuntimeShape& filter_shape,
-    const uint8* filter_data, int32 filter_offset,
-    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    const RuntimeShape& input_shape, const uint8_t* input_data,
+    int32_t input_offset, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, int32_t filter_offset,
+    const RuntimeShape& bias_shape, const int32_t* bias_data,
+    int32_t output_offset, int32_t output_multiplier, int output_shift,
+    int32_t output_activation_min, int32_t output_activation_max,
+    const RuntimeShape& output_shape, uint8_t* output_data,
+    gemmlowp::GemmContext* gemmlowp_context) {
   const int output_dim_count = output_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
   const int output_rows = output_shape.Dims(output_dim_count - 1);
@@ -1090,18 +1092,18 @@ inline void FullyConnectedAsGEMV(
 
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    uint8_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
   ruy::profiler::ScopeLabel label("FullyConnected/8bit");
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   // TODO(b/62193649): This really should be:
@@ -1132,16 +1134,16 @@ inline void FullyConnected(
   TFLITE_DCHECK_EQ(output_rows, filter_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
 
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, output_rows, filter_cols, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
+      filter_matrix(filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
       input_data, filter_cols, batches, filter_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+  gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(
      output_data, output_rows, batches, output_rows);
   const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
      bias_data, output_rows, output_offset, output_multiplier, output_shift,
      output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+  gemmlowp::GemmWithOutputPipeline<
+      uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
      filter_offset, input_offset, output_pipeline);
@@ -1634,18 +1636,18 @@ inline void GEMVForLstmCellWithSymmetricRange(
 
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data_int32, const RuntimeShape& output_shape,
-    int16* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data_int32, const RuntimeShape& output_shape,
+    int16_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
   ruy::profiler::ScopeLabel label("FullyConnected/Uint8Int16");
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   // This is a copy of the reference implementation. We do not currently have a
   // properly optimized version.
   (void)gemmlowp_context;  // only used in properly optimized code.
@@ -1690,13 +1692,13 @@ inline void FullyConnected(
     }
   }
 #endif
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
-      filter_data, output_depth, accum_depth);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
+      weights_matrix(filter_data, output_depth, accum_depth);
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
       input_data, accum_depth, batches);
-  gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+  gemmlowp::MatrixMap<int16_t, gemmlowp::MapOrder::ColMajor> output_matrix(
      output_data, output_depth, batches);
-  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
      ColVectorMap;
  ColVectorMap bias_vector(bias_data_int32, output_depth);
  gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
@@ -1713,19 +1715,19 @@ inline void FullyConnected(
   auto output_pipeline =
       std::make_tuple(bias_addition_stage, scale_stage, clamp_stage,
                       saturating_cast_int16_stage);
-  gemmlowp::GemmWithOutputPipeline<uint8, int16,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+  gemmlowp::GemmWithOutputPipeline<
+      uint8_t, int16_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
       gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
       filter_offset, input_offset, output_pipeline);
 }
 
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data,
+inline void FullyConnected(const uint8_t* input_data, const Dims<4>& input_dims,
+                           int32_t input_offset, const uint8_t* filter_data,
+                           const Dims<4>& filter_dims, int32_t filter_offset,
+                           const int32_t* bias_data, const Dims<4>& bias_dims,
+                           int32_t output_offset, int32_t output_multiplier,
+                           int output_shift, int32_t output_activation_min,
+                           int32_t output_activation_max, uint8_t* output_data,
                            const Dims<4>& output_dims,
                            gemmlowp::GemmContext* gemmlowp_context) {
   tflite::FullyConnectedParams op_params;
@@ -1744,13 +1746,16 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                  gemmlowp_context);
 }
 
-inline void FullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
-    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
-    const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, int16* output_data, const Dims<4>& output_dims,
-    gemmlowp::GemmContext* gemmlowp_context) {
+inline void FullyConnected(const uint8_t* input_data, const Dims<4>& input_dims,
+                           int32_t input_offset, const uint8_t* filter_data,
+                           const Dims<4>& filter_dims, int32_t filter_offset,
+                           const int32_t* bias_data_int32,
+                           const Dims<4>& bias_dims, int32_t output_offset,
+                           int32_t output_multiplier, int output_shift,
+                           int32_t output_activation_min,
+                           int32_t output_activation_max, int16_t* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemmlowp_context) {
   tflite::FullyConnectedParams op_params;
   op_params.input_offset = input_offset;
   op_params.weights_offset = filter_offset;
@@ -1769,13 +1774,13 @@
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                    int32 input_offset, const uint8* filter_data,
-                    const Dims<4>& filter_dims, int32 filter_offset,
-                    const int32* bias_data, const Dims<4>& bias_dims,
-                    int32 output_offset, int32 output_multiplier,
-                    int output_shift, int32 output_activation_min,
-                    int32 output_activation_max, uint8* output_data,
+void FullyConnected(const uint8_t* input_data, const Dims<4>& input_dims,
+                    int32_t input_offset, const uint8_t* filter_data,
+                    const Dims<4>& filter_dims, int32_t filter_offset,
+                    const int32_t* bias_data, const Dims<4>& bias_dims,
+                    int32_t output_offset, int32_t output_multiplier,
+                    int output_shift, int32_t output_activation_min,
+                    int32_t output_activation_max, uint8_t* output_data,
                     const Dims<4>& output_dims,
                     gemmlowp::GemmContext* gemmlowp_context) {
   static_assert(Ac == FusedActivationFunctionType::kNone ||
@@ -1793,12 +1798,13 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
 
 #ifdef USE_NEON
 inline void LegacyInt8FullyConnectedAsGEMVWorkerImpl(
     const RuntimeShape& input_shape, const int8_t* input_data,
-    int32 input_offset, const RuntimeShape& filter_shape,
-    const int8_t* filter_data, int32 filter_offset,
-    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, const RuntimeShape& output_shape,
-    int8_t* output_data, int row_start, int row_end) {
+    int32_t input_offset, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, int32_t filter_offset,
+    const RuntimeShape& bias_shape, const int32_t* bias_data,
+    int32_t output_offset, int32_t output_multiplier, int output_shift,
+    int32_t output_activation_min, int32_t output_activation_max,
+    const RuntimeShape& output_shape, int8_t* output_data, int row_start,
+    int row_end) {
   ruy::profiler::ScopeLabel label("FullyConnectedAsGEMVInt8/8bit");
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
@@ -1931,16 +1937,16 @@ inline void LegacyInt8FullyConnectedAsGEMVWorkerImpl(
           vget_high_s16(input_val));
     }
     if (in < input_size) {
-      int32 buf[16];
+      int32_t buf[16];
       vst1q_s32(buf + 0, acc0);
       vst1q_s32(buf + 4, acc1);
       vst1q_s32(buf + 8, acc2);
       vst1q_s32(buf + 12, acc3);
       for (; in < input_size; in++) {
         int lane = (in + 8 - input_size) % 4;
-        const int32 input_val = input_data[in] + input_offset;
+        const int32_t input_val = input_data[in] + input_offset;
         for (int k = 0; k < kPeel; k++) {
-          int32 filter_val =
+          int32_t filter_val =
              filter_data[in + (out + k) * input_size] + filter_offset;
          buf[lane + 4 * k] += filter_val * input_val;
        }
@@ -1969,7 +1975,7 @@ inline void LegacyInt8FullyConnectedAsGEMVWorkerImpl(
     int32x4_t bias_vec = vld1q_s32(bias_data + out);
     reduced = vaddq_s32(reduced, bias_vec);
     if (shift_left) {
-      const int32 multiplier_power_of_two = 1 << output_shift;
+      const int32_t multiplier_power_of_two = 1 << output_shift;
       reduced = vmulq_n_s32(reduced, multiplier_power_of_two);
       reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
     } else {
@@ -2000,11 +2006,11 @@ inline void LegacyInt8FullyConnectedAsGEMVWorkerImpl(
 struct LegacyInt8FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
   LegacyInt8FullyConnectedAsGEMVWorkerTask(
       const RuntimeShape& input_shape, const int8_t* input_data,
-      int32 input_offset, const RuntimeShape& filter_shape,
-      const int8_t* filter_data, int32 filter_offset,
-      const RuntimeShape& bias_shape, const int32* bias_data,
-      int32 output_offset, int32 output_multiplier, int output_shift,
-      int32 output_activation_min, int32 output_activation_max,
+      int32_t input_offset, const RuntimeShape& filter_shape,
+      const int8_t* filter_data, int32_t filter_offset,
+      const RuntimeShape& bias_shape, const int32_t* bias_data,
+      int32_t output_offset, int32_t output_multiplier, int output_shift,
+      int32_t output_activation_min, int32_t output_activation_max,
       const RuntimeShape& output_shape, int8_t* output_data, int row_start,
       int row_end)
       : input_shape_(input_shape),
@@ -2036,17 +2042,17 @@ struct LegacyInt8FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
 
   const RuntimeShape& input_shape_;
   const int8_t* input_data_;
-  int32 input_offset_;
+  int32_t input_offset_;
   const RuntimeShape& filter_shape_;
   const int8_t* filter_data_;
-  int32 filter_offset_;
+  int32_t filter_offset_;
   const RuntimeShape& bias_shape_;
-  const int32* bias_data_;
-  int32 output_offset_;
-  int32 output_multiplier_;
+  const int32_t* bias_data_;
+  int32_t output_offset_;
+  int32_t output_multiplier_;
   int output_shift_;
-  int32 output_activation_min_;
-  int32 output_activation_max_;
+  int32_t output_activation_min_;
+  int32_t output_activation_max_;
   const RuntimeShape& output_shape_;
   int8_t* output_data_;
   int row_start_;
@@ -2055,12 +2061,13 @@ struct LegacyInt8FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
 
 inline void LegacyInt8FullyConnectedAsGEMV(
     const RuntimeShape& input_shape, const int8_t* input_data,
-    int32 input_offset, const RuntimeShape& filter_shape,
-    const int8_t* filter_data, int32 filter_offset,
-    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
-    int32 output_multiplier, int output_shift, int32 output_activation_min,
-    int32 output_activation_max, const RuntimeShape& output_shape,
-    int8_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
+    int32_t input_offset, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, int32_t filter_offset,
+    const RuntimeShape& bias_shape, const int32_t* bias_data,
+    int32_t output_offset, int32_t output_multiplier, int output_shift,
+    int32_t output_activation_min, int32_t output_activation_max,
+    const RuntimeShape& output_shape, int8_t* output_data,
+    gemmlowp::GemmContext* gemmlowp_context) {
   const int output_dim_count = output_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
   const int output_rows = output_shape.Dims(output_dim_count - 1);
@@ -2104,20 +2111,20 @@ inline void LegacyInt8FullyConnectedAsGEMV(
 
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const int8* input_data, const RuntimeShape& filter_shape,
-    const int8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape, int8* output_data,
-    gemmlowp::GemmContext* gemmlowp_context) {
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, gemmlowp::GemmContext* gemmlowp_context) {
   ruy::profiler::ScopeLabel label("FullyConnectedInt8/8bit");
 #ifdef USE_NEON
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   // TODO(b/62193649): This really should be:
@@ -2174,13 +2181,13 @@ inline void FullyConnected(
 }
 
 struct LegacyShuffledFullyConnectedWorkerTask : gemmlowp::Task {
-  LegacyShuffledFullyConnectedWorkerTask(const uint8* input_data,
-                                         const int8* shuffled_weights_data,
+  LegacyShuffledFullyConnectedWorkerTask(const uint8_t* input_data,
+                                         const int8_t* shuffled_weights_data,
                                          int batches, int output_depth,
                                          int output_stride, int accum_depth,
-                                         const int32* bias_data,
-                                         int32 output_multiplier,
-                                         int output_shift, int16* output_data)
+                                         const int32_t* bias_data,
+                                         int32_t output_multiplier,
+                                         int output_shift, int16_t* output_data)
       : input_data_(input_data),
        shuffled_weights_data_(shuffled_weights_data),
        batches_(batches),
@@ -2199,30 +2206,30 @@ struct LegacyShuffledFullyConnectedWorkerTask : gemmlowp::Task {
                            output_shift_, output_data_);
   }
 
-  const uint8* input_data_;
-  const int8* shuffled_weights_data_;
+  const uint8_t* input_data_;
+  const int8_t* shuffled_weights_data_;
   int batches_;
   int output_depth_;
   int output_stride_;
   int accum_depth_;
-  const int32* bias_data_;
-  int32 output_multiplier_;
+  const int32_t* bias_data_;
+  int32_t output_multiplier_;
   int output_shift_;
-  int16* output_data_;
+  int16_t* output_data_;
 };
 
 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& weights_shape,
-    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, uint8* shuffled_input_workspace_data,
+    const uint8_t* input_data, const RuntimeShape& weights_shape,
+    const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data, uint8_t* shuffled_input_workspace_data,
    gemmlowp::GemmContext* gemmlowp_context) {
   ruy::profiler::ScopeLabel label("ShuffledFullyConnected/8bit");
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   (void)gemmlowp_context;  // only used in optimized code.
   TFLITE_DCHECK_EQ(output_activation_min, -32768);
   TFLITE_DCHECK_EQ(output_activation_max, 32767);
@@ -2246,8 +2253,8 @@ inline void ShuffledFullyConnected(
   // so that just reinterpreting them as int8 values is equivalent to
   // subtracting 128 from them, thus implementing for free the subtraction of
   // the zero_point value 128.
-  const int8* int8_shuffled_weights_data =
-      reinterpret_cast<const int8*>(shuffled_weights_data);
+  const int8_t* int8_shuffled_weights_data =
+      reinterpret_cast<const int8_t*>(shuffled_weights_data);
 
   // Shuffling and xoring of input activations into the workspace buffer
   if (batches == 1) {
@@ -2264,12 +2271,12 @@ inline void ShuffledFullyConnected(
     }
 #endif
   } else if (batches == 4) {
-    uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+    uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
     int c = 0;
 #ifdef USE_NEON
     const uint8x16_t signbit = vdupq_n_u8(0x80);
     for (c = 0; c < accum_depth; c += 16) {
-      const uint8* src_data_ptr = input_data + c;
+      const uint8_t* src_data_ptr = input_data + c;
       uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth);
       uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth);
       uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth);
@@ -2337,12 +2344,12 @@ inline void ShuffledFullyConnected(
 }
 
 inline void ShuffledFullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims,
-    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    int16* output_data, const Dims<4>& output_dims,
-    uint8* shuffled_input_workspace_data,
+    const uint8_t* input_data, const Dims<4>& input_dims,
+    const uint8_t* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32_t* bias_data, const Dims<4>& bias_dims,
+    int32_t output_multiplier, int output_shift, int32_t output_activation_min,
+    int32_t output_activation_max, int16_t* output_data,
+    const Dims<4>& output_dims, uint8_t* shuffled_input_workspace_data,
     gemmlowp::GemmContext* gemmlowp_context) {
   tflite::FullyConnectedParams op_params;
   op_params.output_multiplier = output_multiplier;
@@ -2363,7 +2370,7 @@ inline void ExtractPatchIntoBufferColumn(
     const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
     int stride_width, int stride_height, int pad_width, int pad_height,
     int in_width, int in_height, int in_depth, int single_buffer_length,
-    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 zero_byte) {
+    int buffer_id, const T* in_data, T* conv_buffer_data, uint8_t zero_byte) {
   ExtractPatchIntoBufferColumn(
       DimsToShape(input_dims), w, h, b, kheight, kwidth, stride_width,
       stride_height, pad_width, pad_height, in_width, in_height, in_depth,
@@ -2375,7 +2382,7 @@ void DilatedIm2col(const T* input_data, const Dims<4>& input_dims,
                    const Dims<4>& filter_dims, int stride_width,
                    int stride_height, int dilation_width_factor,
                    int dilation_height_factor, int pad_width, int pad_height,
-                   const Dims<4>& output_dims, uint8 zero_byte,
+                   const Dims<4>& output_dims, uint8_t zero_byte,
                    T* im2col_data) {
   tflite::ConvParams op_params;
   // Padding type is ignored, but still set.
@@ -2395,7 +2402,7 @@ void DilatedIm2col(const T* input_data, const Dims<4>& input_dims,
 template <typename T>
 void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
             int stride_height, int pad_width, int pad_height, int kheight,
-            int kwidth, uint8 zero_byte, T* output_data,
+            int kwidth, uint8_t zero_byte, T* output_data,
             const Dims<4>& output_dims) {
   tflite::ConvParams op_params;
   // Padding type is ignored, but still set.
@@ -2415,7 +2422,7 @@ void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
 template <typename T>
 void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
+            uint8_t zero_byte, T* output_data, const Dims<4>& output_dims) {
   Im2col(input_data, input_dims, stride, stride, pad_width, pad_height,
          kheight, kwidth, zero_byte, output_data, output_dims);
 }
@@ -2441,7 +2448,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   ruy::profiler::ScopeLabel label("Conv");
 
   // NB: the float 0.0f value is represented by all zero bytes.
-  const uint8 float_zero_byte = 0x00;
+  const uint8_t float_zero_byte = 0x00;
   const float* gemm_input_data = nullptr;
   const RuntimeShape* gemm_input_shape = nullptr;
   const int filter_width = filter_shape.Dims(2);
@@ -2622,28 +2629,29 @@ void Conv(const float* input_data, const Dims<4>& input_dims,
 }
 
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
-                 const uint8* input_data, const RuntimeShape& filter_shape,
-                 const uint8* filter_data, const RuntimeShape& bias_shape,
-                 const int32* bias_data, const RuntimeShape& output_shape,
-                 uint8* output_data, const RuntimeShape& im2col_shape,
-                 uint8* im2col_data, gemmlowp::GemmContext* gemmlowp_context) {
+                 const uint8_t* input_data, const RuntimeShape& filter_shape,
+                 const uint8_t* filter_data, const RuntimeShape& bias_shape,
+                 const int32_t* bias_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data, const RuntimeShape& im2col_shape,
+                 uint8_t* im2col_data,
+                 gemmlowp::GemmContext* gemmlowp_context) {
   ruy::profiler::ScopeLabel label("Conv/8bit");
 
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
   const int dilation_height_factor = params.dilation_height_factor;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
 
-  const uint8* gemm_input_data = nullptr;
+  const uint8_t* gemm_input_data = nullptr;
   const RuntimeShape* gemm_input_shape = nullptr;
   const int filter_width = filter_shape.Dims(2);
   const int filter_height = filter_shape.Dims(1);
@@ -2712,31 +2720,32 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   }
 #endif
 
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
-      filter_data, filter_rows, filter_cols);
-  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
+      filter_matrix(filter_data, filter_rows, filter_cols);
+  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
       gemm_input_data, gemm_input_rows, gemm_input_cols);
-  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+  gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(
       output_data, output_rows, output_cols);
   const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp(
       bias_data, output_rows, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max);
-  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
-                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+  gemmlowp::GemmWithOutputPipeline<
+      uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
       gemmlowp_context, filter_matrix, input_matrix, &output_matrix,
       filter_offset, input_offset, output_pipeline);
 }
 
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
+inline void Conv(const uint8_t* input_data, const Dims<4>& input_dims,
+                 int32_t input_offset, const uint8_t* filter_data,
+                 const Dims<4>& filter_dims, int32_t filter_offset,
+                 const int32_t* bias_data, const Dims<4>& bias_dims,
                  int stride_width, int stride_height,
                  int dilation_width_factor, int dilation_height_factor,
                  int pad_width, int pad_height,
-                 int32 output_offset, int32 output_multiplier, int output_shift,
-                 int32 output_activation_min, int32 output_activation_max,
-                 uint8* output_data, const Dims<4>& output_dims,
-                 uint8* im2col_data, const Dims<4>& im2col_dims,
+                 int32_t output_offset, int32_t output_multiplier,
+                 int output_shift, int32_t output_activation_min,
+                 int32_t output_activation_max, uint8_t* output_data,
+                 const Dims<4>& output_dims, uint8_t* im2col_data,
+                 const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemmlowp_context) {
   tflite::ConvParams op_params;
   // Padding type is ignored, but still set.
@@ -2761,16 +2770,16 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                output_data, DimsToShape(im2col_dims), im2col_data,
                gemmlowp_context);
 }
 
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
+inline void Conv(const uint8_t* input_data, const Dims<4>& input_dims,
+                 int32_t input_offset, const uint8_t* filter_data,
+                 const Dims<4>& filter_dims, int32_t filter_offset,
+                 const int32_t* bias_data, const Dims<4>& bias_dims,
                  int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int pad_height, int32_t output_offset,
+                 int32_t output_multiplier, int output_shift,
+                 int32_t output_activation_min, int32_t output_activation_max,
+                 uint8_t* output_data, const Dims<4>& output_dims,
+                 uint8_t* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemmlowp_context) {
   Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
        filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
@@ -2781,16 +2790,16 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
-                 int32 input_offset, const uint8* filter_data,
-                 const Dims<4>& filter_dims, int32 filter_offset,
-                 const int32* bias_data, const Dims<4>& bias_dims,
+inline void Conv(const uint8_t* input_data, const Dims<4>& input_dims,
+                 int32_t input_offset, const uint8_t* filter_data,
+                 const Dims<4>& filter_dims, int32_t filter_offset,
+                 const int32_t* bias_data, const Dims<4>& bias_dims,
                  int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int pad_height, int32_t output_offset,
+                 int32_t output_multiplier, int output_shift,
+                 int32_t output_activation_min, int32_t output_activation_max,
+                 uint8_t* output_data, const Dims<4>& output_dims,
+                 uint8_t* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemmlowp_context) {
   static_assert(Ac == FusedActivationFunctionType::kNone ||
                 Ac == FusedActivationFunctionType::kRelu ||
@@ -2810,15 +2819,16 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void Conv(const uint8* input_data, const Dims<4>& input_dims,
-          int32 input_offset, const uint8* filter_data,
-          const Dims<4>& filter_dims, int32 filter_offset,
-          const int32* bias_data, const Dims<4>& bias_dims, int stride,
-          int pad_width, int pad_height, int32 output_offset,
-          int32 output_multiplier, int output_shift,
-          int32 output_activation_min, int32 output_activation_max,
-          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
-          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemmlowp_context) {
+void Conv(const uint8_t* input_data, const Dims<4>& input_dims,
+          int32_t input_offset, const uint8_t* filter_data,
+          const Dims<4>& filter_dims, int32_t filter_offset,
+          const int32_t* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32_t output_offset,
+          int32_t output_multiplier, int output_shift,
+          int32_t output_activation_min, int32_t output_activation_max,
+          uint8_t* output_data, const Dims<4>& output_dims,
+          uint8_t* im2col_data, const Dims<4>& im2col_dims,
+          gemmlowp::GemmContext* gemmlowp_context) {
   static_assert(Ac == FusedActivationFunctionType::kNone ||
                 Ac == FusedActivationFunctionType::kRelu ||
                 Ac == FusedActivationFunctionType::kRelu6 ||
@@ -2835,7 +2845,7 @@ void Conv(const uint8* input_data, const Dims<4>& input_dims,
 template <typename T>
 void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int kheight, int kwidth,
-            uint8 zero_byte, T* output_data, const Dims<4>& output_dims) {
+            uint8_t zero_byte, T* output_data, const Dims<4>& output_dims) {
   Im2col(input_data, input_dims, stride, stride, pad_width, pad_height,
          kheight, kwidth, zero_byte, output_data, output_dims);
 }
@@ -2863,13 +2873,14 @@ void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
 
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
-void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
-                int32 input_offset, const uint8* filter_data,
-                const Dims<4>& filter_dims, int32 filter_offset,
-                const int32* bias_data, const Dims<4>& bias_dims,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims,
+void ConvAsGemm(const uint8_t* input_data, const Dims<4>& input_dims,
+                int32_t input_offset, const uint8_t* filter_data,
+                const Dims<4>& filter_dims, int32_t filter_offset,
+                const int32_t* bias_data, const Dims<4>& bias_dims,
+                int32_t output_offset, int32_t output_multiplier,
+                int output_shift, int32_t output_activation_min,
+                int32_t output_activation_max, uint8_t* output_data,
+                const Dims<4>& output_dims,
                 gemmlowp::GemmContext* gemmlowp_context) {
ruy::profiler::ScopeLabel label("ConvAsGemm/8bit"); static_assert(Ac == FusedActivationFunctionType::kNone || @@ -2890,16 +2901,16 @@ void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); - gemmlowp::MatrixMap filter_matrix( - filter_data, output_rows, filter_cols, filter_cols); - gemmlowp::MatrixMap input_matrix( + gemmlowp::MatrixMap + filter_matrix(filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap input_matrix( input_data, filter_cols, output_cols, filter_cols); - gemmlowp::MatrixMap output_matrix( + gemmlowp::MatrixMap output_matrix( output_data, output_rows, output_cols, output_rows); const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp( bias_data, output_rows, output_offset, output_multiplier, -output_shift, output_activation_min, output_activation_max); - gemmlowp::GemmWithOutputPipeline( gemmlowp_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, output_pipeline); @@ -2962,7 +2973,7 @@ template void TransposeIm2col(const T* input_data, const Dims<4>& input_dims, const Dims<4>& filter_dims, int stride_width, int stride_height, int pad_width, int pad_height, - const Dims<4>& output_dims, uint8 zero_byte, + const Dims<4>& output_dims, uint8_t zero_byte, T* im2col_data) { tflite::ConvParams op_params; // Padding type is ignored, but still set. @@ -3120,25 +3131,25 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, template inline void LstmCell( const LstmCellParams& params, const RuntimeShape& unextended_input_shape, - const uint8* input_data_uint8, + const uint8_t* input_data_uint8, const RuntimeShape& unextended_prev_activ_shape, - const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape, - const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape, - const int32* bias_data_int32, + const uint8_t* prev_activ_data_uint8, const RuntimeShape& weights_shape, + const uint8_t* weights_data_uint8, + const RuntimeShape& unextended_bias_shape, const int32_t* bias_data_int32, const RuntimeShape& unextended_prev_state_shape, - const int16* prev_state_data_int16, + const int16_t* prev_state_data_int16, const RuntimeShape& unextended_output_state_shape, - int16* output_state_data_int16, + int16_t* output_state_data_int16, const RuntimeShape& unextended_output_activ_shape, - uint8* output_activ_data_uint8, + uint8_t* output_activ_data_uint8, const RuntimeShape& unextended_concat_temp_shape, - uint8* concat_temp_data_uint8, + uint8_t* concat_temp_data_uint8, const RuntimeShape& unextended_activ_temp_shape, - int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) { + int16_t* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) { ruy::profiler::ScopeLabel label( "LstmCell/quantized (8bit external, 16bit internal)"); - int32 weights_zero_point = params.weights_zero_point; - int32 accum_multiplier = params.accum_multiplier; + int32_t weights_zero_point = params.weights_zero_point; + int32_t accum_multiplier = params.accum_multiplier; int accum_shift = params.accum_shift; TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); @@ -3193,8 +3204,8 @@ inline void LstmCell( TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); // Depth-concatenate prev_activ and input data together. 
-  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
-                                              prev_activ_data_uint8};
+  const uint8_t* concat_input_arrays_data[2] = {input_data_uint8,
+                                                prev_activ_data_uint8};
   const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
                                                        &prev_activ_shape};
   tflite::ConcatenationParams concat_params;
@@ -3220,13 +3231,13 @@ inline void LstmCell(
   }
#endif
   if (!gemm_already_performed) {
-    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor>
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
         weights_matrix(weights_data_uint8, fc_output_depth, fc_accum_depth);
-    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
-        concat_temp_data_uint8, fc_accum_depth, fc_batches);
-    gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
+        input_matrix(concat_temp_data_uint8, fc_accum_depth, fc_batches);
+    gemmlowp::MatrixMap<int16_t, gemmlowp::MapOrder::ColMajor> output_matrix(
         activ_temp_data_int16, fc_output_depth, fc_batches);
-    typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+    typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
        ColVectorMap;
    ColVectorMap bias_vector(bias_data_int32, fc_output_depth);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
@@ -3239,21 +3250,23 @@ inline void LstmCell(
     auto output_pipeline = std::make_tuple(bias_addition_stage, scale_stage,
                                            saturating_cast_int16_stage);
     gemmlowp::GemmWithOutputPipeline<
-        uint8, int16, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+        uint8_t, int16_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
         gemmlowp_context, weights_matrix, input_matrix, &output_matrix,
         -weights_zero_point, -128, output_pipeline);
   }
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int16* input_gate_input_ptr = activ_temp_data_int16;
-  const int16* input_modulation_gate_input_ptr =
-      activ_temp_data_int16 + output_depth;
-  const int16* forget_gate_input_ptr = activ_temp_data_int16 + 2 * output_depth;
-  const int16* output_gate_input_ptr = activ_temp_data_int16 + 3 * output_depth;
-  const int16* prev_state_ptr = prev_state_data_int16;
-  int16* output_state_data_ptr = output_state_data_int16;
-  uint8* output_activ_data_ptr = output_activ_data_uint8;
+  const int16_t* input_gate_input_ptr = activ_temp_data_int16;
+  const int16_t* input_modulation_gate_input_ptr =
+      activ_temp_data_int16 + output_depth;
+  const int16_t* forget_gate_input_ptr =
+      activ_temp_data_int16 + 2 * output_depth;
+  const int16_t* output_gate_input_ptr =
+      activ_temp_data_int16 + 3 * output_depth;
+  const int16_t* prev_state_ptr = prev_state_data_int16;
+  int16_t* output_state_data_ptr = output_state_data_int16;
+  uint8_t* output_activ_data_ptr = output_activ_data_uint8;
 
   for (int b = 0; b < outer_size; ++b) {
     int c = 0;
@@ -3391,10 +3404,10 @@ inline void LstmCell(
         *output_state_data_ptr++ = new_state.raw();
         // Down-scale the output activations to 8-bit integers, saturating,
         // and store back to memory.
- int16 rescaled_output_activ = + int16_t rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8); - int16 clamped_output_activ = - std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ)); + int16_t clamped_output_activ = std::max<int16_t>( + -128, std::min<int16_t>(127, rescaled_output_activ)); *output_activ_data_ptr++ = 128 + clamped_output_activ; } input_gate_input_ptr += 3 * output_depth; @@ -3405,17 +3418,18 @@ } template <FusedActivationFunctionType Ac> -void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, - const uint8* prev_activ_data_uint8, - const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8, - const Dims<4>& weights_dims, const int32* bias_data_int32, - const Dims<4>& bias_dims, const int16* prev_state_data_int16, - const Dims<4>& prev_state_dims, int16* output_state_data_int16, - const Dims<4>& output_state_dims, uint8* output_activ_data_uint8, - const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8, - const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16, - const Dims<4>& activ_temp_dims, int32 weights_zero_point, - int32 accum_multiplier, int accum_shift, +void LstmCell(const uint8_t* input_data_uint8, const Dims<4>& input_dims, + const uint8_t* prev_activ_data_uint8, + const Dims<4>& prev_activ_dims, const uint8_t* weights_data_uint8, + const Dims<4>& weights_dims, const int32_t* bias_data_int32, + const Dims<4>& bias_dims, const int16_t* prev_state_data_int16, + const Dims<4>& prev_state_dims, int16_t* output_state_data_int16, + const Dims<4>& output_state_dims, + uint8_t* output_activ_data_uint8, + const Dims<4>& output_activ_dims, uint8_t* concat_temp_data_uint8, + const Dims<4>& concat_temp_dims, int16_t* activ_temp_data_int16, + const Dims<4>& activ_temp_dims, int32_t weights_zero_point, + int32_t accum_multiplier, int accum_shift, gemmlowp::GemmContext* gemmlowp_context) { tflite::LstmCellParams op_params; op_params.weights_zero_point = weights_zero_point; @@ -3458,9 +3472,9 @@ void L2Normalization(const float* input_data, const RuntimeShape& input_shape, output_data); } -inline void L2Normalization(const uint8* input_data, +inline void L2Normalization(const uint8_t* input_data, const RuntimeShape& input_shape, - int32 input_zero_point, uint8* output_data, + int32_t input_zero_point, uint8_t* output_data, const RuntimeShape& output_shape) { tflite::L2NormalizationParams op_params; op_params.input_zero_point = input_zero_point; @@ -3476,9 +3490,9 @@ void L2Normalization(const float* input_data, const Dims<4>& input_dims, DimsToShape(output_dims)); } -inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, uint8* output_data, - const Dims<4>& output_dims) { +inline void L2Normalization(const uint8_t* input_data, + const Dims<4>& input_dims, int32_t input_zero_point, + uint8_t* output_data, const Dims<4>& output_dims) { L2Normalization(input_data, DimsToShape(input_dims), input_zero_point, output_data, DimsToShape(output_dims)); } @@ -3506,14 +3520,15 @@ void Add(const float* input1_data, const Dims<4>& input1_dims, } template <FusedActivationFunctionType Ac> -inline void Add(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, int input2_shift, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) {
+inline void Add(int left_shift, const uint8_t* input1_data, + const Dims<4>& input1_dims, int32_t input1_offset, + int32_t input1_multiplier, int input1_shift, + const uint8_t* input2_data, const Dims<4>& input2_dims, + int32_t input2_offset, int32_t input2_multiplier, + int input2_shift, int32_t output_offset, + int32_t output_multiplier, int output_shift, + int32_t output_activation_min, int32_t output_activation_max, + uint8_t* output_data, const Dims<4>& output_dims) { constexpr int kReverseShift = -1; static_assert(Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu || @@ -3545,15 +3560,15 @@ inline void Add(int left_shift, const uint8* input1_data, } template -void Add(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32* output_data, const Dims<4>& output_dims) { +void Add(const int32_t* input1_data, const Dims<4>& input1_dims, + const int32_t* input2_data, const Dims<4>& input2_dims, + int32_t* output_data, const Dims<4>& output_dims) { ruy::profiler::ScopeLabel label("Add/int32"); TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); tflite::ArithmeticParams op_params; - op_params.quantized_activation_min = std::numeric_limits::min(); - op_params.quantized_activation_max = std::numeric_limits::max(); + op_params.quantized_activation_min = std::numeric_limits::min(); + op_params.quantized_activation_max = std::numeric_limits::max(); Add(op_params, DimsToShape(input1_dims), input1_data, DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), output_data); @@ -3573,15 +3588,15 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, } template -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, +inline void BroadcastAdd(int left_shift, const uint8_t* input1_data, + const Dims<4>& input1_dims, int32_t input1_offset, + int32_t input1_multiplier, int input1_shift, + const uint8_t* input2_data, const Dims<4>& input2_dims, + int32_t input2_offset, int32_t input2_multiplier, + int input2_shift, int32_t output_offset, + int32_t output_multiplier, int output_shift, + int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { constexpr int kReverseShift = -1; static_assert(Ac == FusedActivationFunctionType::kNone || @@ -3616,12 +3631,13 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data, template inline void BroadcastAddFivefold( int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { + const uint8_t* input1_data, const Dims<4>& input1_dims, + int32_t input1_offset, int32_t input1_multiplier, int input1_shift, + const uint8_t* input2_data, const Dims<4>& input2_dims, + int32_t input2_offset, int32_t 
input2_multiplier, int input2_shift, + int32_t output_offset, int32_t output_multiplier, int output_shift, + int32_t output_activation_min, int32_t output_activation_max, + uint8_t* output_data, const Dims<4>& output_dims) { constexpr int kReverseShift = -1; static_assert(Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu || @@ -3672,11 +3688,11 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, } template -inline void Add(const int16* input1_data, const Dims<4>& input1_dims, - int input1_shift, const int16* input2_data, +inline void Add(const int16_t* input1_data, const Dims<4>& input1_dims, + int input1_shift, const int16_t* input2_data, const Dims<4>& input2_dims, int input2_shift, - int16 output_activation_min, int16 output_activation_max, - int16* output_data, const Dims<4>& output_dims) { + int16_t output_activation_min, int16_t output_activation_max, + int16_t* output_data, const Dims<4>& output_dims) { constexpr int kReverseShift = -1; static_assert(Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu || @@ -3728,12 +3744,12 @@ void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, output_data); } -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, +inline void BroadcastMul(const uint8_t* input1_data, const Dims<4>& input1_dims, + int32_t input1_offset, const uint8_t* input2_data, + const Dims<4>& input2_dims, int32_t input2_offset, + int32_t output_offset, int32_t output_multiplier, + int output_shift, int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { tflite::ArithmeticParams op_params; SetActivationParams(output_activation_min, output_activation_max, &op_params); @@ -3750,12 +3766,12 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, // legacy, for compatibility with old checked-in code template -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, +inline void BroadcastMul(const uint8_t* input1_data, const Dims<4>& input1_dims, + int32_t input1_offset, const uint8_t* input2_data, + const Dims<4>& input2_dims, int32_t input2_offset, + int32_t output_offset, int32_t output_multiplier, + int output_shift, int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, input2_dims, input2_offset, output_offset, output_multiplier, @@ -3808,11 +3824,11 @@ bool AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, output_dims); } -inline bool AveragePool(const uint8* input_data, const Dims<4>& input_dims, +inline bool AveragePool(const uint8_t* input_data, const Dims<4>& input_dims, int stride_width, int stride_height, int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, + int32_t output_activation_min, + 
int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { tflite::PoolParams params; params.stride_height = stride_height; @@ -3829,11 +3845,11 @@ inline bool AveragePool(const uint8* input_data, const Dims<4>& input_dims, // legacy, for compatibility with old checked-in code template -bool AveragePool(const uint8* input_data, const Dims<4>& input_dims, +bool AveragePool(const uint8_t* input_data, const Dims<4>& input_dims, int stride_width, int stride_height, int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { + int32_t output_activation_min, int32_t output_activation_max, + uint8_t* output_data, const Dims<4>& output_dims) { static_assert(Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu || Ac == FusedActivationFunctionType::kRelu6 || @@ -3851,10 +3867,10 @@ bool AveragePool(const uint8* input_data, const Dims<4>& input_dims, // legacy, for compatibility with old checked-in code template -bool AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, - int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, +bool AveragePool(const uint8_t* input_data, const Dims<4>& input_dims, + int stride, int pad_width, int pad_height, int filter_width, + int filter_height, int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { return AveragePool(input_data, input_dims, stride, stride, pad_width, pad_height, filter_width, filter_height, @@ -3902,11 +3918,12 @@ void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride, filter_width, filter_height, output_data, output_dims); } -inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, +inline void MaxPool(const uint8_t* input_data, const Dims<4>& input_dims, int stride_width, int stride_height, int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { + int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, + const Dims<4>& output_dims) { PoolParams params; params.stride_height = stride_height; params.stride_width = stride_width; @@ -3922,10 +3939,10 @@ inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, // legacy, for compatibility with old checked-in code template -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, +void MaxPool(const uint8_t* input_data, const Dims<4>& input_dims, int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, + int filter_width, int filter_height, int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { static_assert(Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu || @@ -3943,10 +3960,10 @@ void MaxPool(const uint8* input_data, const Dims<4>& input_dims, // legacy, for compatibility with old checked-in code template -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, +void MaxPool(const uint8_t* input_data, const Dims<4>& input_dims, int stride, int pad_width, int pad_height, int filter_width, int 
filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { + int32_t output_activation_min, int32_t output_activation_max, + uint8_t* output_data, const Dims<4>& output_dims) { MaxPool(input_data, input_dims, stride, stride, pad_width, pad_height, filter_width, filter_height, output_activation_min, output_activation_max, output_data, output_dims); @@ -3993,10 +4010,10 @@ void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, } inline void Softmax(const SoftmaxParams& params, - const RuntimeShape& input_shape, const uint8* input_data, - const RuntimeShape& output_shape, uint8* output_data) { - const int32 input_beta_multiplier = params.input_multiplier; - const int32 input_beta_left_shift = params.input_left_shift; + const RuntimeShape& input_shape, const uint8_t* input_data, + const RuntimeShape& output_shape, uint8_t* output_data) { + const int32_t input_beta_multiplier = params.input_multiplier; + const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as @@ -4006,9 +4023,10 @@ inline void Softmax(const SoftmaxParams& params, static const int kScaledDiffIntegerBits = 5; static const int kAccumulationIntegerBits = 12; using FixedPointScaledDiff = - gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; + using FixedPointAccum = + gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; ruy::profiler::ScopeLabel label("Softmax/8bit"); const int trailing_dim = input_shape.DimensionsCount() - 1; @@ -4018,11 +4036,11 @@ inline void Softmax(const SoftmaxParams& params, MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int b = 0; b < outer_size; ++b) { - const uint8* input_data_ptr = input_data + b * depth; - uint8* output_data_ptr = output_data + b * depth; + const uint8_t* input_data_ptr = input_data + b * depth; + uint8_t* output_data_ptr = output_data + b * depth; // Determine the largest entry in the current row - uint8 max_in_row = 0; + uint8_t max_in_row = 0; { int c = 0; #ifdef USE_NEON @@ -4114,9 +4132,10 @@ inline void Softmax(const SoftmaxParams& params, FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0)); #endif for (; c < depth; ++c) { - int32 input_diff = static_cast<int32>(input_data_ptr[c]) - max_in_row; + int32_t input_diff = + static_cast<int32_t>(input_data_ptr[c]) - max_in_row; if (input_diff >= diff_min) { - const int32 input_diff_rescaled = + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = @@ -4172,16 +4191,17 @@ inline void Softmax(const SoftmaxParams& params, } #endif for (; c < depth; ++c) { - int32 input_diff = static_cast<int32>(input_data_ptr[c]) - max_in_row; + int32_t input_diff = + static_cast<int32_t>(input_data_ptr[c]) - max_in_row; if (input_diff >= diff_min) { - const int32 input_diff_rescaled = + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled); FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32 unsat_output = gemmlowp::RoundingDivideByPOT( +
int32_t unsat_output = gemmlowp::RoundingDivideByPOT( (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); output_data_ptr[c] = std::max(std::min(unsat_output, 255), 0); @@ -4209,20 +4229,20 @@ inline void Softmax(const float* input_data, const Dims<4>& input_dims, DimsToShape(output_dims)); } -inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape, - int32 input_beta_multiplier, int32 input_beta_left_shift, - int diff_min, uint8* output_data, - const RuntimeShape& output_shape) { +inline void Softmax(const uint8_t* input_data, const RuntimeShape& input_shape, + int32_t input_beta_multiplier, + int32_t input_beta_left_shift, int diff_min, + uint8_t* output_data, const RuntimeShape& output_shape) { SoftmaxParams params; params.input_multiplier = input_beta_multiplier; params.input_left_shift = input_beta_left_shift; params.diff_min = diff_min; Softmax(params, input_shape, input_data, output_shape, output_data); } -inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_beta_multiplier, int32 input_beta_left_shift, - int diff_min, uint8* output_data, - const Dims<4>& output_dims) { +inline void Softmax(const uint8_t* input_data, const Dims<4>& input_dims, + int32_t input_beta_multiplier, + int32_t input_beta_left_shift, int diff_min, + uint8_t* output_data, const Dims<4>& output_dims) { Softmax(input_data, DimsToShape(input_dims), input_beta_multiplier, input_beta_left_shift, diff_min, output_data, DimsToShape(output_dims)); @@ -4241,11 +4261,12 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, DimsToShape(output_dims)); } -inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape, - int32 input_multiplier, int32 input_left_shift, - int32 reverse_scaling_divisor, - int32 reverse_scaling_right_shift, int diff_min, - uint8* output_data, const RuntimeShape& output_shape) { +inline void LogSoftmax(const uint8_t* input_data, + const RuntimeShape& input_shape, + int32_t input_multiplier, int32_t input_left_shift, + int32_t reverse_scaling_divisor, + int32_t reverse_scaling_right_shift, int diff_min, + uint8_t* output_data, const RuntimeShape& output_shape) { SoftmaxParams params; params.input_multiplier = input_multiplier; params.input_left_shift = input_left_shift; @@ -4256,11 +4277,11 @@ inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape, output_data); } -inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_multiplier, int32 input_left_shift, - int32 reverse_scaling_divisor, - int32 reverse_scaling_right_shift, int diff_min, - uint8* output_data, const Dims<4>& output_dims) { +inline void LogSoftmax(const uint8_t* input_data, const Dims<4>& input_dims, + int32_t input_multiplier, int32_t input_left_shift, + int32_t reverse_scaling_divisor, + int32_t reverse_scaling_right_shift, int diff_min, + uint8_t* output_data, const Dims<4>& output_dims) { reference_ops::LogSoftmax( input_data, DimsToShape(input_dims), input_multiplier, input_left_shift, reverse_scaling_divisor, reverse_scaling_right_shift, diff_min, @@ -4268,12 +4289,12 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, } inline void Logistic(const LogisticParams& params, - const RuntimeShape& input_shape, const uint8* input_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input_shape, const uint8_t* input_data, + const RuntimeShape& output_shape, uint8_t* output_data) { ruy::profiler::ScopeLabel 
label("Logistic/Uint8"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int32 input_multiplier = params.input_multiplier; + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int32_t input_multiplier = params.input_multiplier; const int input_left_shift = params.input_left_shift; const int size = MatchingFlatSize(input_shape, output_shape); @@ -4378,39 +4399,39 @@ inline void Logistic(const LogisticParams& params, #endif // Leftover loop: handle one value at a time with scalar code. for (; c < size; ++c) { - const uint8 input_val_u8 = input_data[c]; - const int32 input_val_centered = - static_cast<int32>(input_val_u8) - input_zero_point; - uint8 output_val; + const uint8_t input_val_u8 = input_data[c]; + const int32_t input_val_centered = + static_cast<int32_t>(input_val_u8) - input_zero_point; + uint8_t output_val; if (input_val_centered < -input_range_radius) { output_val = 0; } else if (input_val_centered > input_range_radius) { output_val = 255; } else { - const int32 input_val_rescaled = + const int32_t input_val_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_val_centered, input_multiplier, input_left_shift); - using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; - using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>; + using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); + int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); if (output_val_s32 == 256) { output_val_s32 = 255; } TFLITE_DCHECK_GE(output_val_s32, 0); TFLITE_DCHECK_LE(output_val_s32, 255); - output_val = static_cast<uint8>(output_val_s32); + output_val = static_cast<uint8_t>(output_val_s32); } output_data[c] = output_val; } } -inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const RuntimeShape& output_shape) { +inline void Logistic(const uint8_t* input_data, const RuntimeShape& input_shape, + int32_t input_zero_point, int32_t input_range_radius, + int32_t input_multiplier, int input_left_shift, + uint8_t* output_data, const RuntimeShape& output_shape) { LogisticParams params; params.input_zero_point = input_zero_point; params.input_range_radius = input_range_radius; @@ -4425,31 +4446,31 @@ inline void Logistic(const float* input_data, const Dims<4>& input_dims, output_data); } -inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { +inline void Logistic(const uint8_t* input_data, const Dims<4>& input_dims, + int32_t input_zero_point, int32_t input_range_radius, + int32_t input_multiplier, int input_left_shift, + uint8_t* output_data, const Dims<4>& output_dims) { Logistic(input_data, DimsToShape(input_dims), input_zero_point, input_range_radius, input_multiplier, input_left_shift, output_data, DimsToShape(output_dims)); } -inline void Logistic(const RuntimeShape& input_shape, const int16* input_data, - const RuntimeShape& output_shape, int16* output_data) { +inline void Logistic(const
RuntimeShape& input_shape, const int16_t* input_data, + const RuntimeShape& output_shape, int16_t* output_data) { LogisticParams params; // No params currently needed by int16 Logistic. Logistic(params, input_shape, input_data, output_shape, output_data); } -inline void Logistic(const int16* input_data, const RuntimeShape& input_shape, - int16* output_data, const RuntimeShape& output_shape) { +inline void Logistic(const int16_t* input_data, const RuntimeShape& input_shape, + int16_t* output_data, const RuntimeShape& output_shape) { LogisticParams params; // No params currently needed by int16 Logistic. Logistic(params, input_shape, input_data, output_shape, output_data); } -inline void Logistic(const int16* input_data, const Dims<4>& input_dims, - int16* output_data, const Dims<4>& output_dims) { +inline void Logistic(const int16_t* input_data, const Dims<4>& input_dims, + int16_t* output_data, const Dims<4>& output_dims) { Logistic(input_data, DimsToShape(input_dims), output_data, DimsToShape(output_dims)); } @@ -4461,13 +4482,13 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims, } inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + const uint8_t* input_data, const RuntimeShape& output_shape, + uint8_t* output_data) { // Note that this is almost the exact same code as in Logistic(). ruy::profiler::ScopeLabel label("Tanh"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int32 input_multiplier = params.input_multiplier; + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int32_t input_multiplier = params.input_multiplier; const int input_left_shift = params.input_left_shift; const int size = MatchingFlatSize(input_shape, output_shape); @@ -4580,40 +4601,40 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, #endif // Leftover loop: handle one value at a time with scalar code. 
for (; c < size; ++c) { - const uint8 input_val_u8 = input_data[c]; - const int32 input_val_centered = - static_cast<int32>(input_val_u8) - input_zero_point; - uint8 output_val; + const uint8_t input_val_u8 = input_data[c]; + const int32_t input_val_centered = + static_cast<int32_t>(input_val_u8) - input_zero_point; + uint8_t output_val; if (input_val_centered < -input_range_radius) { output_val = 0; } else if (input_val_centered > input_range_radius) { output_val = 255; } else { - const int32 input_val_rescaled = + const int32_t input_val_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_val_centered, input_multiplier, input_left_shift); - using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; - using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>; + using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24); + int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24); output_val_s32 += output_zero_point; if (output_val_s32 == 256) { output_val_s32 = 255; } TFLITE_DCHECK_GE(output_val_s32, 0); TFLITE_DCHECK_LE(output_val_s32, 255); - output_val = static_cast<uint8>(output_val_s32); + output_val = static_cast<uint8_t>(output_val_s32); } output_data[c] = output_val; } } -inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const RuntimeShape& output_shape) { +inline void Tanh(const uint8_t* input_data, const RuntimeShape& input_shape, + int32_t input_zero_point, int32_t input_range_radius, + int32_t input_multiplier, int input_left_shift, + uint8_t* output_data, const RuntimeShape& output_shape) { TanhParams params; params.input_zero_point = input_zero_point; params.input_range_radius = input_range_radius; @@ -4622,25 +4643,25 @@ inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape, Tanh(params, input_shape, input_data, output_shape, output_data); } -inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { +inline void Tanh(const uint8_t* input_data, const Dims<4>& input_dims, + int32_t input_zero_point, int32_t input_range_radius, + int32_t input_multiplier, int input_left_shift, + uint8_t* output_data, const Dims<4>& output_dims) { Tanh(input_data, DimsToShape(input_dims), input_zero_point, input_range_radius, input_multiplier, input_left_shift, output_data, DimsToShape(output_dims)); } -inline void Tanh(const int16* input_data, const RuntimeShape& input_shape, - int input_left_shift, int16* output_data, +inline void Tanh(const int16_t* input_data, const RuntimeShape& input_shape, + int input_left_shift, int16_t* output_data, const RuntimeShape& output_shape) { TanhParams params; params.input_left_shift = input_left_shift; Tanh(params, input_shape, input_data, output_shape, output_data); } -inline void Tanh(const int16* input_data, const Dims<4>& input_dims, - int input_left_shift, int16* output_data, +inline void Tanh(const int16_t* input_data, const Dims<4>& input_dims, + int input_left_shift, int16_t* output_data, const Dims<4>& output_dims) { Tanh(input_data, DimsToShape(input_dims), input_left_shift, output_data,
DimsToShape(output_dims)); @@ -4692,10 +4713,10 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims, output_activation_max, output_data, output_dims); } -inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32 output_activation_min, int32 output_activation_max, - int32* output_data, const Dims<4>& output_dims) { +inline void Mul(const int32_t* input1_data, const Dims<4>& input1_dims, + const int32_t* input2_data, const Dims<4>& input2_dims, + int32_t output_activation_min, int32_t output_activation_max, + int32_t* output_data, const Dims<4>& output_dims) { tflite::ArithmeticParams op_params; op_params.quantized_activation_min = output_activation_min; op_params.quantized_activation_max = output_activation_max; @@ -4706,9 +4727,9 @@ inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, } template -void Mul(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32* output_data, const Dims<4>& output_dims) { +void Mul(const int32_t* input1_data, const Dims<4>& input1_dims, + const int32_t* input2_data, const Dims<4>& input2_dims, + int32_t* output_data, const Dims<4>& output_dims) { TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); tflite::ArithmeticParams op_params; // No parameters needed. @@ -4718,9 +4739,9 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims, DimsToShape(output_dims), output_data); } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int16* output_data, const Dims<4>& output_dims) { +inline void Mul(const int16_t* input1_data, const Dims<4>& input1_dims, + const int16_t* input2_data, const Dims<4>& input2_dims, + int16_t* output_data, const Dims<4>& output_dims) { tflite::ArithmeticParams op_params; // No parameters needed. 
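All of the legacy overloads in this file follow the shim pattern visible here: repack the old Dims<4>, whose sizes[] array runs innermost-first, into a RuntimeShape, whose dimensions run outermost-first, then forward to the modern kernel. A simplified stand-alone illustration; LegacyDims4 and ToNhwc are stand-ins, not the real TFLite types:

#include <array>

// Stand-in for tflite::Dims<4>: sizes[0] is the innermost (depth) extent.
struct LegacyDims4 {
  std::array<int, 4> sizes;
};

// Reverses into the NHWC order that RuntimeShape expects:
// batch, height, width, depth.
inline std::array<int, 4> ToNhwc(const LegacyDims4& dims) {
  return {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]};
}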
@@ -4729,10 +4750,10 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, output_data); } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int32 output_offset, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, +inline void Mul(const int16_t* input1_data, const Dims<4>& input1_dims, + const int16_t* input2_data, const Dims<4>& input2_dims, + int32_t output_offset, int32_t output_activation_min, + int32_t output_activation_max, uint8_t* output_data, const Dims<4>& output_dims) { tflite::ArithmeticParams op_params; op_params.output_offset = output_offset; @@ -4802,7 +4823,7 @@ inline void Floor(const float* input_data, const Dims<4>& input_dims, } inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, - const int32* output_size_data, + const int32_t* output_size_data, const Dims<4>& output_size_dims, float* output_data, const Dims<4>& output_dims, bool align_corners) { tflite::ResizeBilinearParams op_params; @@ -4813,10 +4834,11 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, DimsToShape(output_dims), output_data); } -inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, - const int32* output_size_data, - const Dims<4>& output_size_dims, uint8* output_data, - const Dims<4>& output_dims, bool align_corners) { +inline void ResizeBilinear(const uint8_t* input_data, const Dims<4>& input_dims, + const int32_t* output_size_data, + const Dims<4>& output_size_dims, + uint8_t* output_data, const Dims<4>& output_dims, + bool align_corners) { tflite::ResizeBilinearParams op_params; op_params.align_corners = align_corners; op_params.half_pixel_centers = false; @@ -4827,7 +4849,7 @@ inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, // legacy, for compatibility with old checked-in code inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, - const int32* output_size_data, + const int32_t* output_size_data, const Dims<4>& output_size_dims, float* output_data, const Dims<4>& output_dims) { ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, @@ -4835,19 +4857,19 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, } // legacy, for compatibility with old checked-in code -inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, - const int32* output_size_data, - const Dims<4>& output_size_dims, uint8* output_data, - const Dims<4>& output_dims) { +inline void ResizeBilinear(const uint8_t* input_data, const Dims<4>& input_dims, + const int32_t* output_size_data, + const Dims<4>& output_size_dims, + uint8_t* output_data, const Dims<4>& output_dims) { ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, output_data, output_dims, /*align_corners=*/false); } template inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, - const int32* block_shape_data, + const int32_t* block_shape_data, const Dims<4>& block_shape_dims, - const int32* crops_data, const Dims<4>& crops_dims, + const int32_t* crops_data, const Dims<4>& crops_dims, T* output_data, const Dims<4>& output_dims) { BatchToSpaceND(DimsToShape(input_dims), input_data, DimsToShape(block_shape_dims), block_shape_data, @@ -4930,8 +4952,8 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, DimsToShape(output_dims), output_data); } -inline void Dequantize(const uint8* 
input_data, const Dims<4>& input_dims, - int32 zero_point, double scale, float* output_data, +inline void Dequantize(const uint8_t* input_data, const Dims<4>& input_dims, + int32_t zero_point, double scale, float* output_data, const Dims<4>& output_dims) { tflite::DequantizationParams op_params; op_params.zero_point = zero_point; diff --git a/tensorflow/lite/kernels/internal/optimized/resize_bilinear.h b/tensorflow/lite/kernels/internal/optimized/resize_bilinear.h index 2e1abf7a59ac12..909965e136443e 100644 --- a/tensorflow/lite/kernels/internal/optimized/resize_bilinear.h +++ b/tensorflow/lite/kernels/internal/optimized/resize_bilinear.h @@ -45,7 +45,7 @@ namespace resize_bilinear { // (a) Optimizations can be tried experimentally. // (b) Optimizations can be specialized for architectures, eg Intel vs ARM. -inline int16x8_t Load8IntoLowerS16(const uint8* data_ptr) { +inline int16x8_t Load8IntoLowerS16(const uint8_t* data_ptr) { return vreinterpretq_s16_u16(vmovl_u8(vld1_u8(data_ptr))); } @@ -54,7 +54,7 @@ inline uint16x8_t Move8IntoUpperU16(const uint8x8_t vec_val) { return vshlq_n_u16(vmovl_u8(vec_val), 8); } -inline uint16x8_t Load8IntoUpperU16(const uint8* data_ptr) { +inline uint16x8_t Load8IntoUpperU16(const uint8_t* data_ptr) { return Move8IntoUpperU16(vld1_u8(data_ptr)); } @@ -107,7 +107,7 @@ struct op_int16x8_t { // This really selects vshlq_n_s16, but requires a longer implementation to // convert the shift argument back to a constant. In some compiles are macros // requiring constant args. - inline op_int16x8_t operator<<=(int32 left_shift) { + inline op_int16x8_t operator<<=(int32_t left_shift) { switch (left_shift) { case 1: val = vshlq_n_s16(val, 1); @@ -127,7 +127,7 @@ struct op_int16x8_t { // This really selects vshrq_n_u16, but requires a longer implementation to // convert the shift argument back to a constant. In some compiles are macros // requiring constant args. - inline op_int16x8_t operator>>=(int32 right_shift) { + inline op_int16x8_t operator>>=(int32_t right_shift) { switch (right_shift) { case 1: val = vshrq_n_s16(val, 1); @@ -154,11 +154,11 @@ struct op_int16x8_t { lhs -= rhs; return lhs; } - friend inline op_int16x8_t operator<<(op_int16x8_t lhs, int32 left_shift) { + friend inline op_int16x8_t operator<<(op_int16x8_t lhs, int32_t left_shift) { lhs <<= left_shift; return lhs; } - friend inline op_int16x8_t operator>>(op_int16x8_t lhs, int32 right_shift) { + friend inline op_int16x8_t operator>>(op_int16x8_t lhs, int32_t right_shift) { lhs >>= right_shift; return lhs; } @@ -191,7 +191,7 @@ struct op_uint16x8_t { // This really selects vshlq_n_s16, but requires a longer implementation to // convert the shift argument back to a constant. In some compiles are macros // requiring constant args. - inline op_uint16x8_t operator<<=(int32 left_shift) { + inline op_uint16x8_t operator<<=(int32_t left_shift) { switch (left_shift) { case 1: val = vshlq_n_u16(val, 1); @@ -211,7 +211,7 @@ struct op_uint16x8_t { // This really selects vshrq_n_u16, but requires a longer implementation to // convert the shift argument back to a constant. In some compiles are macros // requiring constant args. 
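The comment above refers to a NEON constraint: vshlq_n_s16 and its relatives take the shift count as an immediate, so a runtime count has to be dispatched to a constant through a switch, exactly as the operators here do. The same pattern reduced to a stand-alone sketch; the case list is illustrative, and vshlq_s16 with a register count serves as the general fallback:

#ifdef USE_NEON
#include <arm_neon.h>
#include <cstdint>

inline int16x8_t ShiftLeftRuntime(int16x8_t v, int shift) {
  switch (shift) {
    case 1:
      return vshlq_n_s16(v, 1);  // immediate form, needs a constant count
    case 4:
      return vshlq_n_s16(v, 4);
    case 8:
      return vshlq_n_s16(v, 8);
    default:
      // vshlq_s16 accepts a per-lane runtime shift count.
      return vshlq_s16(v, vdupq_n_s16(static_cast<int16_t>(shift)));
  }
}
#endif  // USE_NEON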
- inline op_uint16x8_t operator>>=(int32 right_shift) { + inline op_uint16x8_t operator>>=(int32_t right_shift) { switch (right_shift) { case 1: val = vshrq_n_u16(val, 1); @@ -238,11 +238,13 @@ struct op_uint16x8_t { lhs -= rhs; return lhs; } - friend inline op_uint16x8_t operator<<(op_uint16x8_t lhs, int32 left_shift) { + friend inline op_uint16x8_t operator<<(op_uint16x8_t lhs, + int32_t left_shift) { lhs <<= left_shift; return lhs; } - friend inline op_uint16x8_t operator>>(op_uint16x8_t lhs, int32 right_shift) { + friend inline op_uint16x8_t operator>>(op_uint16x8_t lhs, + int32_t right_shift) { lhs >>= right_shift; return lhs; } @@ -262,20 +264,20 @@ inline op_uint16x8_t VReinterpretQU16S16(const op_int16x8_t& other) { // // This optimization is for the half_pixel_centers == true version, for uint8. // There are versions for NEON and non-NEON compilation. -inline void ResizeBilinear888Uint8(int32 batches, int32 input_height, - int32 input_width, int32 depth, - const uint8* input_data, - uint8* output_data) { +inline void ResizeBilinear888Uint8(int32_t batches, int32_t input_height, + int32_t input_width, int32_t depth, + const uint8_t* input_data, + uint8_t* output_data) { TFLITE_DCHECK_GE(input_height, 1); TFLITE_DCHECK_GE(input_width, 1); TFLITE_DCHECK_EQ(depth % 8, 0); - const int32 input_row_stride = input_width * depth; - const int32 output_row_stride = input_row_stride * 8; + const int32_t input_row_stride = input_width * depth; + const int32_t output_row_stride = input_row_stride * 8; for (int b = 0; b < batches; ++b) { - const uint8* input_base_ptr = + const uint8_t* input_base_ptr = input_data + b * input_row_stride * input_height; - uint8* output_base_ptr = + uint8_t* output_base_ptr = output_data + b * output_row_stride * input_height * 8; #ifdef USE_NEON @@ -361,24 +363,24 @@ inline void ResizeBilinear888Uint8(int32 batches, int32 input_height, } // Fill out remainder of top margin. std::memcpy(output_base_ptr + output_row_stride, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); std::memcpy(output_base_ptr + output_row_stride * 2, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); std::memcpy(output_base_ptr + output_row_stride * 3, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); output_base_ptr += output_row_stride * 4; // Main rows. for (int k = 0; k < (input_height - 1); ++k) { for (int c_block = 0; c_block < depth; c_block += 8) { - uint8* output_base_ptr_0 = output_base_ptr; - uint8* output_base_ptr_1; - uint8* output_base_ptr_2; - uint8* output_base_ptr_3; - uint8* output_base_ptr_4; - uint8* output_base_ptr_5; - uint8* output_base_ptr_6; - uint8* output_base_ptr_7; + uint8_t* output_base_ptr_0 = output_base_ptr; + uint8_t* output_base_ptr_1; + uint8_t* output_base_ptr_2; + uint8_t* output_base_ptr_3; + uint8_t* output_base_ptr_4; + uint8_t* output_base_ptr_5; + uint8_t* output_base_ptr_6; + uint8_t* output_base_ptr_7; op_uint16x8_t accum_0_c_v; op_uint16x8_t accum_1_c_v; @@ -774,11 +776,11 @@ inline void ResizeBilinear888Uint8(int32 batches, int32 input_height, } // Fill out remainder of bottom margin. 
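In this 8x path each vertical margin is four output rows tall; the code writes the first margin row and then clones it, since the remaining rows are byte-identical. The cloning idiom as a stand-alone helper (ReplicateRow is illustrative, not a TFLite function):

#include <cstdint>
#include <cstring>

// Copies the row at base over the next `times` rows of the same stride.
inline void ReplicateRow(std::uint8_t* base, int row_stride, int times) {
  for (int i = 1; i <= times; ++i) {
    std::memcpy(base + i * row_stride, base, row_stride);
  }
}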
std::memcpy(output_base_ptr + output_row_stride, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); std::memcpy(output_base_ptr + output_row_stride * 2, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); std::memcpy(output_base_ptr + output_row_stride * 3, output_base_ptr, - output_row_stride * sizeof(uint8)); + output_row_stride * sizeof(uint8_t)); #else // USE_NEON for (int c_block = 0; c_block < depth; c_block += 8) { @@ -1227,7 +1229,7 @@ inline void ResizeBilinear888Uint8(int32 batches, int32 input_height, } // namespace resize_bilinear #ifdef USE_NEON -inline void ResizeBilinearKernel(const float* input_ptr, int32 depth, +inline void ResizeBilinearKernel(const float* input_ptr, int32_t depth, float scale, float* output_ptr) { int ic = 0; // Handle 32 input channels at a time. @@ -1323,21 +1325,22 @@ inline void ResizeBilinearKernel(const float* input_ptr, int32 depth, } #endif -inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, - int32 x, int32 y, int32 depth, int32 batch, +inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, + int32_t y1, int32_t x, int32_t y, + int32_t depth, int32_t batch, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data) { TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int32 input_width = input_shape.Dims(2); - const int32 output_width = output_shape.Dims(2); + const int32_t input_width = input_shape.Dims(2); + const int32_t output_width = output_shape.Dims(2); - const int32 input_x_offset = (x1 - x0) * depth; - const int32 input_y_offset = (y1 - y0) * depth * input_width; - const int32 output_x_offset = depth; - const int32 output_y_offset = depth * output_width; + const int32_t input_x_offset = (x1 - x0) * depth; + const int32_t input_y_offset = (y1 - y0) * depth * input_width; + const int32_t output_x_offset = depth; + const int32_t output_y_offset = depth * output_width; #ifdef USE_NEON TFLITE_DCHECK(x1 >= x0); @@ -1440,7 +1443,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, } // Handle one input channel at a time. for (; ic < depth; ic++) { - const int32 input_offset = Offset(input_shape, batch, y0, x0, ic); + const int32_t input_offset = Offset(input_shape, batch, y0, x0, ic); float x0y0 = input_data[input_offset]; float x1y0 = input_data[input_offset + input_x_offset]; @@ -1448,7 +1451,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, float x1y1 = input_data[input_offset + input_x_offset + input_y_offset]; // Top left corner. - const int32 output_offset = Offset(output_shape, batch, y, x, ic); + const int32_t output_offset = Offset(output_shape, batch, y, x, ic); output_data[output_offset] = x0y0; // Top right corner. 
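With align_corners and half_pixel_centers both false, a 2x upscale places every even output pixel exactly on an input pixel, so each 2x2 output block reduces to fixed-weight averages of its four input corner samples, which is what the kernel above computes per channel. A scalar sketch of one such block (names are illustrative):

// One 2x2 output block from the four surrounding input samples.
inline void Upsample2x2Block(float x0y0, float x1y0, float x0y1, float x1y1,
                             float* out) {  // out: {TL, TR, BL, BR}
  out[0] = x0y0;                                 // top-left: passthrough
  out[1] = 0.5f * (x0y0 + x1y0);                 // top-right: x average
  out[2] = 0.5f * (x0y0 + x0y1);                 // bottom-left: y average
  out[3] = 0.25f * (x0y0 + x1y0 + x0y1 + x1y1);  // bottom-right: full average
}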
@@ -1489,9 +1492,9 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, #endif } -inline void ResizeBilinear2x2(int32 batches, int32 input_height, - int32 input_width, int32 depth, - int32 output_height, int32 output_width, +inline void ResizeBilinear2x2(int32_t batches, int32_t input_height, + int32_t input_width, int32_t depth, + int32_t output_height, int32_t output_width, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, @@ -1499,8 +1502,8 @@ inline void ResizeBilinear2x2(int32 batches, int32 input_height, for (int b = 0; b < batches; b++) { for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) { for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) { - int32 x1 = std::min(x0 + 1, input_width - 1); - int32 y1 = std::min(y0 + 1, input_height - 1); + int32_t x1 = std::min(x0 + 1, input_width - 1); + int32_t y1 = std::min(y0 + 1, input_height - 1); ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data, output_shape, output_data); } @@ -1509,32 +1512,32 @@ inline void ResizeBilinear2x2(int32 batches, int32 input_height, } inline void ResizeBilinearGeneric( - int32 batches, int32 input_height, int32 input_width, int32 depth, - int32 output_height, int32 output_width, float height_scale, + int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, + int32_t output_height, int32_t output_width, float height_scale, float width_scale, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& output_shape, float* output_data, const bool half_pixel_centers) { memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float)); - int32 output_offset = 0; + int32_t output_offset = 0; for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { float input_y; - int32 y0, y1; + int32_t y0, y1; reference_ops::ComputeInterpolationValues( y, height_scale, half_pixel_centers, input_height, &input_y, &y0, &y1); for (int x = 0; x < output_width; ++x) { float input_x; - int32 x0, x1; + int32_t x0, x1; reference_ops::ComputeInterpolationValues( x, width_scale, half_pixel_centers, input_width, &input_x, &x0, &x1); float* output_ptr = &output_data[output_offset]; // Run kernel on the 4 corners of the bilinear resize algorithm. 
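The ComputeInterpolationValues calls above map each output coordinate to a fractional input coordinate plus its two neighboring indices; the four (y0, y1) by (x0, x1) combinations are the corners fed to the kernel just below. A self-contained approximation of that mapping, with edge clamping simplified relative to the reference implementation:

#include <algorithm>
#include <cmath>

inline void InterpolationValues(int out_coord, float scale,
                                bool half_pixel_centers, int input_size,
                                float* in_coord, int* lo, int* hi) {
  // Half-pixel centers sample at pixel centers, otherwise at pixel origins.
  *in_coord = half_pixel_centers ? (out_coord + 0.5f) * scale - 0.5f
                                 : out_coord * scale;
  *lo = std::max(static_cast<int>(std::floor(*in_coord)), 0);
  *hi = std::min(*lo + 1, input_size - 1);
}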
- int32 input_offset = Offset(input_shape, b, y0, x0, 0); + int32_t input_offset = Offset(input_shape, b, y0, x0, 0); float scale = (1 - (input_y - y0)) * (1 - (input_x - x0)); const float* input_ptr = &input_data[input_offset]; ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); @@ -1562,8 +1565,8 @@ inline void ResizeBilinearGeneric( template inline void ResizeBilinearGenericSmallChannel( - int32 batches, int32 input_height, int32 input_width, int32 depth, - int32 output_height, int32 output_width, float height_scale, + int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, + int32_t output_height, int32_t output_width, float height_scale, float width_scale, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& output_shape, T* output_data, const bool half_pixel_centers) { @@ -1573,21 +1576,21 @@ inline void ResizeBilinearGenericSmallChannel( for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { float input_y; - int32 y0, y1; + int32_t y0, y1; reference_ops::ComputeInterpolationValues( y, height_scale, half_pixel_centers, input_height, &input_y, &y0, &y1); for (int x = 0; x < output_width; ++x) { float input_x; - int32 x0, x1; + int32_t x0, x1; reference_ops::ComputeInterpolationValues( x, width_scale, half_pixel_centers, input_width, &input_x, &x0, &x1); - int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0), - Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), - Offset(input_shape, b, y1, x1, 0)}; + int32_t input_offset[4] = {Offset(input_shape, b, y0, x0, 0), + Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), + Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), @@ -1610,7 +1613,7 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, const RuntimeShape& unextended_input_shape, const float* input_data, const RuntimeShape& output_size_shape, - const int32* output_size_data, + const int32_t* output_size_data, const RuntimeShape& unextended_output_shape, float* output_data) { ruy::profiler::ScopeLabel label("ResizeBilinear"); @@ -1623,14 +1626,14 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape); - int32 batches = MatchingDim(input_shape, 0, output_shape, 0); - int32 input_height = input_shape.Dims(1); - int32 input_width = input_shape.Dims(2); - int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); - int32 output_height = output_size_data[0]; - int32 output_width = output_size_data[1]; + int32_t output_height = output_size_data[0]; + int32_t output_width = output_size_data[1]; // Specialize for 2x2 upsample. if (!op_params.align_corners && !op_params.half_pixel_centers && @@ -1659,11 +1662,11 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, // or int16 arithmetic. 
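Both the float path above and the uint8 path below derive their scales the same way: align_corners pins the last input sample to the last output sample, otherwise the scale is the plain size ratio. As a stand-alone helper (illustrative):

// Scale factor mapping output coordinates back to input coordinates.
inline float ResizeScale(int input_size, int output_size, bool align_corners) {
  return (align_corners && output_size > 1)
             ? static_cast<float>(input_size - 1) / (output_size - 1)
             : static_cast<float>(input_size) / output_size;
}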
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, const RuntimeShape& unextended_input_shape, - const uint8* input_data, + const uint8_t* input_data, const RuntimeShape& output_size_shape, - const int32* output_size_data, + const int32_t* output_size_data, const RuntimeShape& unextended_output_shape, - uint8* output_data) { + uint8_t* output_data) { ruy::profiler::ScopeLabel label("ResizeBilinearUint8"); // If half_pixel_centers is True, align_corners must be False. TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners); @@ -1674,18 +1677,18 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape); - int32 batches = MatchingDim(input_shape, 0, output_shape, 0); - int32 input_height = input_shape.Dims(1); - int32 input_width = input_shape.Dims(2); - int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); - int32 output_height = output_size_data[0]; - int32 output_width = output_size_data[1]; + int32_t output_height = output_size_data[0]; + int32_t output_width = output_size_data[1]; if (!op_params.align_corners && op_params.half_pixel_centers && ((depth % 8) == 0)) { - const int32 scale = output_height / input_height; + const int32_t scale = output_height / input_height; // Restricting the minimum output dimensions may not be necessary, but // ensures that kernels can use unrolling with minimal code size. if ((output_height >= 8) && (output_width >= 8) && @@ -1709,7 +1712,7 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, ? (static_cast(input_width - 1) / (output_width - 1)) : (static_cast(input_width) / output_width); - ResizeBilinearGenericSmallChannel( + ResizeBilinearGenericSmallChannel( batches, input_height, input_width, depth, output_height, output_width, height_scale, width_scale, input_shape, input_data, output_shape, output_data, op_params.half_pixel_centers); @@ -1718,11 +1721,11 @@ inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, // TODO(b/180609127) Create optimized int8 version from uint8. Call from here. inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, const RuntimeShape& unextended_input_shape, - const int8* input_data, + const int8_t* input_data, const RuntimeShape& unextended_output_size_shape, - const int32* output_size_data, + const int32_t* output_size_data, const RuntimeShape& unextended_output_shape, - int8* output_data) { + int8_t* output_data) { reference_ops::ResizeBilinearInteger(op_params, unextended_input_shape, input_data, unextended_output_size_shape, output_size_data,