Skip to content

Commit

Permalink
transpose: implement large1D twiddle multiply for length < 256
Browse files Browse the repository at this point in the history
* transpose: implement large1D twiddle multiply for length < 256

* rocfft-test: remove 1D prime sizes that are covered by radX

* rocfft-test: allow array type + placement to be overridden by test suites

* rocfft-test: test all 1D C2C sizes < 8k
  • Loading branch information
evetsso authored Apr 15, 2021
1 parent 7e4c5f0 commit a470ba6
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 3 deletions.
6 changes: 5 additions & 1 deletion clients/tests/accuracy_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ inline auto param_generator_base(const std::vector<rocfft_transform_type>& typ
const std::vector<std::vector<size_t>>& v_lengths,
const std::vector<rocfft_precision>& precision_range,
const std::vector<size_t>& batch_range,
decltype(generate_types) types_generator,
const stride_generator& istride,
const stride_generator& ostride,
const std::vector<std::vector<size_t>>& ioffset_range,
Expand All @@ -424,7 +425,7 @@ inline auto param_generator_base(const std::vector<rocfft_transform_type>& typ
{
for(const auto batch : batch_range)
{
for(const auto& types : generate_types(transform_type, place_range))
for(const auto& types : types_generator(transform_type, place_range))
{
for(const auto& istride_dist : istride.generate(lengths, batch))
{
Expand Down Expand Up @@ -485,6 +486,7 @@ inline auto param_generator(const std::vector<std::vector<size_t>>& v_length
v_lengths,
precision_range,
batch_range,
generate_types,
istride,
ostride,
ioffset_range,
Expand All @@ -507,6 +509,7 @@ inline auto param_generator_complex(const std::vector<std::vector<size_t>>&
v_lengths,
precision_range,
batch_range,
generate_types,
istride,
ostride,
ioffset_range,
Expand All @@ -529,6 +532,7 @@ inline auto param_generator_real(const std::vector<std::vector<size_t>>& v_l
v_lengths,
precision_range,
batch_range,
generate_types,
istride,
ostride,
ioffset_range,
Expand Down
51 changes: 50 additions & 1 deletion clients/tests/accuracy_test_1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,32 @@ const static std::vector<size_t> mix_range
900, 1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000};

const static std::vector<size_t> prime_range
= {7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};
= {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};

// Build the list of 1D lengths in [2, SMALL_1D_MAX) that do not appear in any
// of the explicit size ranges declared in this file (pow2/pow3/pow5/radX/mix/
// prime), so those lengths can be exercised by a separate test suite.
//
// Returns the remaining lengths in ascending order.
static std::vector<size_t> small_1D_sizes()
{
    // Exclusive upper bound on the generated lengths.
    static const size_t SMALL_1D_MAX = 8192;

    // Gather every length listed in the explicit ranges into one sorted
    // vector so that membership can be tested with a binary search.
    std::vector<size_t> known_sizes;
    known_sizes.insert(known_sizes.end(), pow2_range.begin(), pow2_range.end());
    known_sizes.insert(known_sizes.end(), pow3_range.begin(), pow3_range.end());
    known_sizes.insert(known_sizes.end(), pow5_range.begin(), pow5_range.end());
    known_sizes.insert(known_sizes.end(), radX_range.begin(), radX_range.end());
    known_sizes.insert(known_sizes.end(), mix_range.begin(), mix_range.end());
    known_sizes.insert(known_sizes.end(), prime_range.begin(), prime_range.end());
    std::sort(known_sizes.begin(), known_sizes.end());

    // Walk the candidate lengths upward from 2, keeping only the ones that
    // are absent from the explicit ranges.
    std::vector<size_t> remaining;
    for(size_t length = 2; length < SMALL_1D_MAX; ++length)
    {
        const bool already_listed
            = std::binary_search(known_sizes.begin(), known_sizes.end(), length);
        if(already_listed)
            continue;
        remaining.push_back(length);
    }
    return remaining;
}

const static std::vector<std::vector<size_t>> stride_range = {{1}};

Expand Down Expand Up @@ -225,6 +250,30 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D,
place_range)),
accuracy_test::TestName);

// Smoke test for small 1D lengths: the goal is only to make sure our
// factorization isn't completely broken for the sizes returned by
// small_1D_sizes(), so a single simple configuration is checked:
// forward C2C, single precision, batch 1, interleaved input and output,
// with rocfft_placement_notinplace as the only placement supplied.
//
// NOTE(review): param_generator_base declares v_lengths as
// std::vector<std::vector<size_t>>, so `{small_1D_sizes()}` builds ONE outer
// entry that holds every size. Presumably each outer entry describes the
// lengths of a single problem -- confirm whether one entry per size was
// intended here.
INSTANTIATE_TEST_SUITE_P(small_1D,
accuracy_test,
::testing::ValuesIn(param_generator_base(
{rocfft_transform_type_complex_forward},
{small_1D_sizes()},
{rocfft_precision_single},
{1},
// Custom types generator: returns exactly one tuple that pairs the
// transform type with the first supplied placement and interleaved
// complex arrays for both input and output.
[](rocfft_transform_type t,
const std::vector<rocfft_result_placement>& place_range) {
return std::vector<type_place_io_t>{
std::make_tuple(t,
place_range[0],
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved)};
},
stride_range,
stride_range,
ioffset_range_zero,
ooffset_range_zero,
{rocfft_placement_notinplace})),
accuracy_test::TestName);

// NB:
// We have known non-unit stride issues for 1D:
// - C2C middle sizes (for instance, single precision, 8192)
Expand Down
13 changes: 12 additions & 1 deletion library/src/device/kernels/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,18 @@
#define TRANSPOSE_TWIDDLE_MUL(tmp) \
if(WITH_TWL) \
{ \
if(TWL == 2) \
if(TWL == 1) \
{ \
if(DIR == -1) \
{ \
TWIDDLE_STEP_MUL_FWD(TWLstep1, twiddles_large, (gx + tx1) * (gy + ty1 + i), tmp); \
} \
else \
{ \
TWIDDLE_STEP_MUL_INV(TWLstep1, twiddles_large, (gx + tx1) * (gy + ty1 + i), tmp); \
} \
} \
else if(TWL == 2) \
{ \
if(DIR == -1) \
{ \
Expand Down
73 changes: 73 additions & 0 deletions library/src/device/transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,77 @@ rocfft_status rocfft_transpose_outofplace_template(size_t m,
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 0, 1, false, false, false>));

// twl=1:
tmap.emplace(
std::make_tuple(1, -1, true, true, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, true, true, true>));
tmap.emplace(
std::make_tuple(1, -1, false, true, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, false, true, true>));
tmap.emplace(
std::make_tuple(1, -1, true, false, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, true, false, true>));
tmap.emplace(
std::make_tuple(1, -1, false, false, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, false, false, true>));

tmap.emplace(
std::make_tuple(1, 1, true, true, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, true, true, true>));
tmap.emplace(
std::make_tuple(1, 1, false, true, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, false, true, true>));

tmap.emplace(
std::make_tuple(1, 1, true, false, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, true, false, true>));
tmap.emplace(
std::make_tuple(1, 1, false, false, true),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, false, false, true>));

tmap.emplace(
std::make_tuple(1, -1, true, true, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, true, true, false>));
tmap.emplace(
std::make_tuple(1, -1, false, true, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, false, true, false>));
tmap.emplace(
std::make_tuple(1, -1, true, false, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, true, false, false>));
tmap.emplace(
std::make_tuple(1, -1, false, false, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, -1, false, false, false>));

tmap.emplace(
std::make_tuple(1, 1, true, true, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, true, true, false>));
tmap.emplace(
std::make_tuple(1, 1, false, true, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, false, true, false>));

tmap.emplace(
std::make_tuple(1, 1, true, false, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, true, false, false>));
tmap.emplace(
std::make_tuple(1, 1, false, false, false),
&HIP_KERNEL_NAME(
transpose_kernel2<T, TA, TB, TRANSPOSE_DIM_X, TRANSPOSE_DIM_Y, true, 1, 1, false, false, false>));

// twl=2:
tmap.emplace(
std::make_tuple(2, -1, true, true, true),
Expand Down Expand Up @@ -578,6 +649,8 @@ void rocfft_internal_transpose_var2(const void* data_p, void* back_p)
twl = 3;
else if(data->node->large1D > (size_t)256)
twl = 2;
else if(data->node->large1D > 0)
twl = 1;
else
twl = 0;

Expand Down

0 comments on commit a470ba6

Please sign in to comment.