diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 682c928ce..757430695 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -11,8 +11,8 @@ on: - '*' jobs: - opensycl-clang14-omp-docker: - name: ${{ matrix.name }} + hipsycl-clang14-omp-docker: + name: hipsycl-clang14-omp-docker runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 @@ -21,4 +21,12 @@ jobs: docker build . --tag nmtools:sycl-clang14-omp --file docker/sycl.dockerfile - name: run tests run: | - docker run --rm nmtools:sycl-clang14-omp \ No newline at end of file + docker run --rm nmtools:sycl-clang14-omp + hipsycl-clang14-cuda-docker: + name: hipsycl-clang14-cuda-docker + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: build docker + run: | + docker build . --tag nmtools:sycl-clang14-cuda --file docker/sycl-cuda.dockerfile \ No newline at end of file diff --git a/cmake/toolchains/sycl-clang14-cuda.cmake b/cmake/toolchains/sycl-clang14-cuda.cmake new file mode 100644 index 000000000..d7082e710 --- /dev/null +++ b/cmake/toolchains/sycl-clang14-cuda.cmake @@ -0,0 +1,17 @@ +set(CMAKE_C_COMPILER /usr/local/bin/syclcc) +set(CMAKE_CXX_COMPILER /usr/local/bin/syclcc) + +SET (CMAKE_C_COMPILER_WORKS 1) +add_compile_options(-W -Wall -Wextra -Werror + -Wno-gnu-string-literal-operator-template + -Wno-unknown-cuda-version + --acpp-targets="cuda:sm_60" + --acpp-clang=/usr/bin/clang++-14 +) +add_link_options( + --acpp-targets="cuda:sm_60" + --acpp-clang=/usr/bin/clang++-14 +) +set(NMTOOLS_TEST_CUDA_PATH "/usr/local/cuda/" CACHE STRING "manually set cuda path") +link_directories("${NMTOOLS_TEST_CUDA_PATH}/lib64") +link_libraries(cudart_static dl rt pthread) \ No newline at end of file diff --git a/cmake/toolchains/sycl-clang14-omp.cmake b/cmake/toolchains/sycl-clang14-omp.cmake index 05b71a5b5..172c510c3 100644 --- a/cmake/toolchains/sycl-clang14-omp.cmake +++ b/cmake/toolchains/sycl-clang14-omp.cmake @@ -5,4 +5,8 @@ SET (CMAKE_C_COMPILER_WORKS 1) add_compile_options(-W -Wall -Werror -Wextra -Wno-gnu-string-literal-operator-template --acpp-targets=omp --acpp-clang=/usr/bin/clang++-14 +) +add_link_options( + --acpp-targets=omp + --acpp-clang=/usr/bin/clang++-14 ) \ No newline at end of file diff --git a/docker/cuda-sycl.dockerfile b/docker/sycl-cuda.dockerfile similarity index 85% rename from docker/cuda-sycl.dockerfile rename to docker/sycl-cuda.dockerfile index 71a43bf44..c7c163977 100644 --- a/docker/cuda-sycl.dockerfile +++ b/docker/sycl-cuda.dockerfile @@ -49,12 +49,13 @@ RUN bash scripts/install_doctest.sh RUN apt install -y libclang-dev clang-tools libomp-dev llvm-dev lld libboost-dev libboost-fiber-dev libboost-context-dev RUN bash scripts/install_opensycl.sh -ARG toolchain=sycl-clang14-omp -RUN mkdir -p build && cd build \ +ARG toolchain=sycl-clang14-cuda +RUN mkdir -p build/${toolchain} && cd build/${toolchain} \ && cmake -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/${toolchain}.cmake \ -DNMTOOLS_BUILD_META_TESTS=OFF -DNMTOOLS_BUILD_UTL_TESTS=OFF -DNMTOOLS_TEST_ALL=OFF \ -DNMTOOLS_BUILD_SYCL_TESTS=ON \ - ../ \ - && make -j2 VERBOSE=1 + ../.. \ + && make -j2 VERBOSE=1 numeric-tests-sycl-doctest -CMD ["/workspace/nmtools/build/tests/sycl/numeric-tests-sycl-doctest"] \ No newline at end of file +ENV toolchain=${toolchain} +CMD ["sh", "-c", "/workspace/nmtools/build/${toolchain}/tests/sycl/numeric-tests-sycl-doctest"] \ No newline at end of file diff --git a/docker/sycl.dockerfile b/docker/sycl.dockerfile index 1ad594aab..98da03b6c 100644 --- a/docker/sycl.dockerfile +++ b/docker/sycl.dockerfile @@ -49,11 +49,12 @@ RUN apt install -y libclang-dev clang-tools libomp-dev llvm-dev lld libboost-dev RUN bash scripts/install_opensycl.sh ARG toolchain=sycl-clang14-omp -RUN mkdir -p build && cd build \ +RUN mkdir -p build/${toolchain} && cd build/${toolchain} \ && cmake -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/${toolchain}.cmake \ -DNMTOOLS_BUILD_META_TESTS=OFF -DNMTOOLS_BUILD_UTL_TESTS=OFF -DNMTOOLS_TEST_ALL=OFF \ -DNMTOOLS_BUILD_SYCL_TESTS=ON \ - ../ \ - && make -j2 VERBOSE=1 + ../.. \ + && make -j2 VERBOSE=1 numeric-tests-sycl-doctest -CMD ["/workspace/nmtools/build/tests/sycl/numeric-tests-sycl-doctest"] \ No newline at end of file +ENV toolchain=${toolchain} +CMD ["sh", "-c", "/workspace/nmtools/build/${toolchain}/tests/sycl/numeric-tests-sycl-doctest"] \ No newline at end of file diff --git a/include/nmtools/array/eval/kernel_helper.hpp b/include/nmtools/array/eval/kernel_helper.hpp index ba6f84bd7..62ced3358 100644 --- a/include/nmtools/array/eval/kernel_helper.hpp +++ b/include/nmtools/array/eval/kernel_helper.hpp @@ -156,6 +156,14 @@ namespace nmtools::array } }; + template + constexpr auto compute_offset(kernel_size thread_id, kernel_size block_id, kernel_size block_size) + { + // TODO: check for grid shape + auto idx = block_id.x() * block_size.x() + thread_id.x(); + return idx; + } + template < typename size_type=nm_size_t , typename mutable_array_t @@ -169,14 +177,14 @@ namespace nmtools::array , kernel_size block_id , kernel_size block_size ) { - if constexpr (meta::is_maybe_v) { + if constexpr (meta::is_maybe_v>) { if (!static_cast(result)) { return; } assign_result(output,*result,thread_id,block_id,block_size); } else { auto size = nmtools::size(output); - auto idx = block_id.x() * block_size.x() + thread_id.x(); + auto idx = compute_offset(thread_id,block_id,block_size); if (idx < size) { auto flat_lhs = view::mutable_flatten(output); auto flat_rhs = view::flatten(result); diff --git a/include/nmtools/array/eval/sycl/context.hpp b/include/nmtools/array/eval/sycl/context.hpp index 0d083a984..7034f7165 100644 --- a/include/nmtools/array/eval/sycl/context.hpp +++ b/include/nmtools/array/eval/sycl/context.hpp @@ -3,10 +3,13 @@ #include "nmtools/meta.hpp" #include "nmtools/array/ndarray.hpp" +#include "nmtools/array/functional/functor.hpp" +#include "nmtools/array/eval/sycl/info.hpp" #include "nmtools/array/eval/kernel_helper.hpp" #include "nmtools/exception.hpp" #include +#include #include namespace nmtools::array @@ -118,12 +121,18 @@ namespace nmtools::array::sycl struct context_t { - using queue_t = ::sycl::queue; + using queue_t = ::sycl::queue; + using device_t = ::sycl::device; - std::shared_ptr queue; + // TODO: share same queue for same context (?) + // std::shared_ptr queue; + nmtools_maybe device = meta::Nothing; context_t() - : queue(std::make_shared()) + {} + + context_t(device_t device) + : device(device) {} template @@ -158,35 +167,42 @@ namespace nmtools::array::sycl auto create_array(const array_t& array) { static_assert( - meta::is_ndarray_v - && !meta::is_view_v + ((meta::is_ndarray_v> && meta::is_pointer_v) + || meta::is_num_v || meta::is_ndarray_v) + && !meta::is_view_v> , "unsupported array type for create_array" ); - const auto buffer = nmtools::data(array); - const auto numel = nmtools::size(array); - const auto shape = nmtools::shape(array); - const auto dim = nmtools::dim(array); - - using element_t = meta::get_element_type_t; - using dim_t = meta::remove_cvref_t; - - // TODO: keep src shape traits - using device_shape_t = nmtools_static_vector; - auto device_shape = device_shape_t{}; - device_shape.resize(dim); - for (size_t i=0; i) { + return array; + } else if constexpr (meta::is_pointer_v) { + return create_array(*array); + } else { + const auto buffer = nmtools::data(array); + const auto numel = nmtools::size(array); + const auto shape = nmtools::shape(array); + const auto dim = nmtools::dim(array); + + using element_t = meta::get_element_type_t; + using dim_t = meta::remove_cvref_t; + + // TODO: keep src shape traits + using device_shape_t = nmtools_static_vector; + auto device_shape = device_shape_t{}; + device_shape.resize(dim); + for (size_t i=0; i; - auto sycl_buffer = buffer_t(buffer,(size_t)numel); + using buffer_t = ::sycl::buffer; + auto sycl_buffer = buffer_t(buffer,(size_t)numel); - using device_array_t = device_array; - using device_array_ptr = std::shared_ptr; + using device_array_t = device_array; + using device_array_ptr = std::shared_ptr; - auto array_raw_ptr = new device_array_t{sycl_buffer,device_shape,dim}; - auto array_ptr = device_array_ptr(array_raw_ptr); - return array_ptr; + auto array_raw_ptr = new device_array_t{sycl_buffer,device_shape,dim}; + auto array_ptr = device_array_ptr(array_raw_ptr); + return array_ptr; + } } template @@ -226,6 +242,12 @@ namespace nmtools::array::sycl template typename sequence> auto run_(output_array_t& output, const function_t& f, nmtools_tuple args_pack, sequence) { + auto queue = std::shared_ptr(); + if (!device) { + queue = std::make_shared(); + } else { + queue = std::make_shared(*device); + } using element_t = meta::get_element_type_t; auto numel = nmtools::size(output); // TODO: pass actual type (constant / clipped shape) as is to device @@ -249,30 +271,55 @@ namespace nmtools::array::sycl [[maybe_unused]] constexpr auto N = sizeof...(args_t); - // .accessor() sycl-equivalent to cudaMemcpy + pass to kernel (?) + // get_accessor / .accessor() sycl-equivalent to cudaMemcpy + pass to kernel (?) auto operands = nmtools_tuple{get_accessor(nmtools::get(args_pack),cgh,access_mode)...}; - auto kernel_range = ::sycl::range<1>(thread_size); - cgh.parallel_for(kernel_range,[=](::sycl::id<1> id){ + auto kernel_range = ::sycl::nd_range<1>(thread_size, warp_size); + + // TODO: fix wrong result on cuda, but correct on openmp: + // seems like the work item is clipped to 32(?) on cuda, the rest of work item is not executed + #if 0 + std::cout << "- numel: " << numel << "\n"; + std::cout << "- warp_size: " << warp_size << "\n"; + std::cout << "- thread_size: " << thread_size << "\n"; + std::cout << "- kernel_range: " << kernel_range << "\n"; + + ::sycl::stream out(1024, 256, cgh); + out << ::sycl::endl; + #endif + + // TODO: change to nd_item with 3 dim to properly use thread & block structure like cuda + cgh.parallel_for(kernel_range,[=](::sycl::nd_item<1> item){ auto output = create_mutable_array(&output_accessor[0],&output_shape_accessor[0],output_dim); auto result = functional::apply(f,operands); // TODO: properly get the thread & kernel id and shape - auto thread_id = array::kernel_size{id.get(0),0,0}; + auto thread_id = array::kernel_size{item.get_global_id(),0,0}; auto block_id = array::kernel_size{0,0,0}; auto block_size = array::kernel_size{1,1,1}; + + #if 0 + auto idx = compute_offset(thread_id,block_id,block_size); + out << item.get_global_id() + << "->" + << idx + << ";" + << ::sycl::flush + ; + #endif + array::assign_result(output,result,thread_id,block_id,block_size); }); }); this->copy_buffer(output_buffer,output); } - + template typename tuple, typename...operands_t> auto run(const function_t& f, output_array_t& output, const tuple& operands) { constexpr auto N = sizeof...(operands_t); auto device_operands = meta::template_reduce([&](auto init, auto index){ - const auto& arg_i = nmtools::get(operands); + const auto& arg_i = nmtools::at(operands,index); if constexpr (meta::is_num_v) { return utility::tuple_append(init,arg_i); } else { @@ -286,15 +333,130 @@ namespace nmtools::array::sycl this->run_(output,f,device_operands,sequence_t{}); } }; +} // namespace nmtools::array::sycl +#define PRINT_PLATFORM_PROPERTY(platform, prop) \ + std::cout << "- " << #prop << ": " \ + << platform.get_info<::sycl::info::platform::prop>() << std::endl; + +#define PRINT_DEVICE_PROPERTY(selected_device, prop) \ +std::cout << "- " << #prop << ": " \ + << selected_device.get_info<::sycl::info::device::prop>() << std::endl; + +namespace nmtools::array::sycl +{ inline auto default_context() { static std::shared_ptr default_context; if (!default_context) { - default_context = std::make_shared(); + auto sycl_devices = ::sycl::device::get_devices(); + auto platform_idx = 0ul; + if (auto env_idx = std::getenv("NMTOOLS_SYCL_DEFAULT_PLATFORM_IDX")) { + platform_idx = std::stoi(env_idx); + } + // TODO: better logging utilities + std::cout << "\033[1;33m[nmtools sycl]\033[0m number of sycl devices: " << sycl_devices.size() << "\n"; + for (auto i=0ul; i); + PRINT_DEVICE_PROPERTY(selected_device, max_work_item_sizes<2>); + PRINT_DEVICE_PROPERTY(selected_device, max_work_item_sizes<3>); + PRINT_DEVICE_PROPERTY(selected_device, max_work_group_size); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_char); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_short); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_int); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_long); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_float); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_double); + PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_half); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_char); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_short); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_int); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_long); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_float); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_double); + PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_half); + PRINT_DEVICE_PROPERTY(selected_device, max_clock_frequency); + PRINT_DEVICE_PROPERTY(selected_device, address_bits); + PRINT_DEVICE_PROPERTY(selected_device, max_mem_alloc_size); + PRINT_DEVICE_PROPERTY(selected_device, image_support); + PRINT_DEVICE_PROPERTY(selected_device, max_read_image_args); + PRINT_DEVICE_PROPERTY(selected_device, max_write_image_args); + PRINT_DEVICE_PROPERTY(selected_device, image2d_max_height); + PRINT_DEVICE_PROPERTY(selected_device, image2d_max_width); + PRINT_DEVICE_PROPERTY(selected_device, image3d_max_height); + PRINT_DEVICE_PROPERTY(selected_device, image3d_max_width); + PRINT_DEVICE_PROPERTY(selected_device, image3d_max_depth); + PRINT_DEVICE_PROPERTY(selected_device, image_max_buffer_size); + PRINT_DEVICE_PROPERTY(selected_device, image_max_array_size); + PRINT_DEVICE_PROPERTY(selected_device, max_samplers); + PRINT_DEVICE_PROPERTY(selected_device, max_parameter_size); + PRINT_DEVICE_PROPERTY(selected_device, mem_base_addr_align); + + PRINT_DEVICE_PROPERTY(selected_device, half_fp_config); + PRINT_DEVICE_PROPERTY(selected_device, single_fp_config); + PRINT_DEVICE_PROPERTY(selected_device, double_fp_config); + PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_type); + PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_line_size); + PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_size); + PRINT_DEVICE_PROPERTY(selected_device, global_mem_size); + PRINT_DEVICE_PROPERTY(selected_device, max_constant_buffer_size); + PRINT_DEVICE_PROPERTY(selected_device, max_constant_args); + PRINT_DEVICE_PROPERTY(selected_device, local_mem_type); + PRINT_DEVICE_PROPERTY(selected_device, local_mem_size); + PRINT_DEVICE_PROPERTY(selected_device, error_correction_support); + PRINT_DEVICE_PROPERTY(selected_device, host_unified_memory); + PRINT_DEVICE_PROPERTY(selected_device, profiling_timer_resolution); + PRINT_DEVICE_PROPERTY(selected_device, is_endian_little); + PRINT_DEVICE_PROPERTY(selected_device, is_available); + PRINT_DEVICE_PROPERTY(selected_device, is_compiler_available); + PRINT_DEVICE_PROPERTY(selected_device, is_linker_available); + PRINT_DEVICE_PROPERTY(selected_device, execution_capabilities); + PRINT_DEVICE_PROPERTY(selected_device, queue_profiling); + PRINT_DEVICE_PROPERTY(selected_device, built_in_kernels); + + + PRINT_DEVICE_PROPERTY(selected_device, printf_buffer_size); + PRINT_DEVICE_PROPERTY(selected_device, preferred_interop_user_sync); + PRINT_DEVICE_PROPERTY(selected_device, partition_max_sub_devices); + + PRINT_DEVICE_PROPERTY(selected_device, partition_properties); + PRINT_DEVICE_PROPERTY(selected_device, partition_affinity_domains); + PRINT_DEVICE_PROPERTY(selected_device, partition_type_property); + PRINT_DEVICE_PROPERTY(selected_device, partition_type_affinity_domain); + PRINT_DEVICE_PROPERTY(selected_device, reference_count); + } + default_context = std::make_shared(selected_device); } return default_context; } } +#undef PRINT_PLATFORM_PROPERTY +#undef PRINT_DEVICE_PROPERTY + #endif // NMTOOLS_ARRAY_EVAL_SYCL_CONTEXT_HPP \ No newline at end of file diff --git a/include/nmtools/array/eval/sycl/info.hpp b/include/nmtools/array/eval/sycl/info.hpp new file mode 100644 index 000000000..bc20d5616 --- /dev/null +++ b/include/nmtools/array/eval/sycl/info.hpp @@ -0,0 +1,181 @@ +// LICENSE: GNU General Public License v3.0 +// adapted from https://github.com/illuhad/syclinfo/blob/f644f1a25dfd94aa07f7c58f5f6171e5651a966d/syclinfo.cpp: +// - fix for some global_mem_cache_type +// - include in nmtools namespace +// - adapt namespace changes ::sycl instead of cl::sycl +// - no boost dependencies +// - inline for c++17 + +#ifndef NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP +#define NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP + +#include +#include +#include + +namespace nmtools::array::sycl +{ + inline std::map<::sycl::info::device_type, std::string> device_type_representation { + {::sycl::info::device_type::cpu, "CPU"}, + {::sycl::info::device_type::gpu, "GPU"}, + {::sycl::info::device_type::accelerator, "Accelerator"}, + {::sycl::info::device_type::custom, "Custom"}, + {::sycl::info::device_type::automatic, "Automatic"}, + {::sycl::info::device_type::host, "Host"}, + {::sycl::info::device_type::all, "All"} + }; + + inline std::map<::sycl::info::global_mem_cache_type, std::string> global_mem_cache_type_representation { + {::sycl::info::global_mem_cache_type::none, "None"}, + {::sycl::info::global_mem_cache_type::read_only, "Read-only"}, + {::sycl::info::global_mem_cache_type::read_write, "Read-Write"} + }; + + + inline std::map<::sycl::info::local_mem_type, std::string> local_mem_type_representation { + {::sycl::info::local_mem_type::none, "None"}, + {::sycl::info::local_mem_type::local, "Local"}, + {::sycl::info::local_mem_type::global, "Global"} + }; + + inline std::map<::sycl::info::fp_config, std::string> fp_config_representation { + { ::sycl::info::fp_config::denorm, "denorm"}, + { ::sycl::info::fp_config::inf_nan, "inf_nan"}, + { ::sycl::info::fp_config::round_to_nearest, "round_to_nearest"}, + { ::sycl::info::fp_config::round_to_zero, "round_to_zero"}, + { ::sycl::info::fp_config::round_to_inf, "round_to_inf"}, + { ::sycl::info::fp_config::fma, "fma"}, + { ::sycl::info::fp_config::correctly_rounded_divide_sqrt, "correctly_rounded_divide_sqrt"}, + { ::sycl::info::fp_config::soft_float, "soft_float" } + }; + + inline std::map<::sycl::info::execution_capability, std::string> exec_capability_representation{ + {::sycl::info::execution_capability::exec_kernel, "exec_kernel"}, + {::sycl::info::execution_capability::exec_native_kernel, "exec_native_kernel"} + }; + + + inline std::map<::sycl::info::partition_property, std::string> + partition_property_representation { + {::sycl::info::partition_property::no_partition, "no_partition"}, + {::sycl::info::partition_property::partition_equally, "partition_equally"}, + {::sycl::info::partition_property::partition_by_counts, "partition_by_counts"}, + {::sycl::info::partition_property::partition_by_affinity_domain, "partition_by_affinity_domain"} + }; + + inline std::map<::sycl::info::partition_affinity_domain, std::string> + partition_affinity_domain_representation { + {::sycl::info::partition_affinity_domain::not_applicable, "not_applicable"}, + {::sycl::info::partition_affinity_domain::numa, "numa"}, + {::sycl::info::partition_affinity_domain::L4_cache, "L4_cache"}, + {::sycl::info::partition_affinity_domain::L3_cache, "L3_cache"}, + {::sycl::info::partition_affinity_domain::L2_cache, "L2_cache"}, + {::sycl::info::partition_affinity_domain::L1_cache, "L1_cache"}, + {::sycl::info::partition_affinity_domain::next_partitionable, "next_partitionable"} + }; +} // namespace nmtools::array::sycl + +template +std::ostream& operator<<(std::ostream& lhs, const std::vector& rhs) +{ + for(std::size_t i = 0; i < rhs.size(); ++i) + { + lhs << rhs[i]; + if (i != rhs.size()-1) + lhs << ", "; + } + return lhs; +} + +namespace std +{ + + using nmtools::array::sycl::device_type_representation; + using nmtools::array::sycl::local_mem_type_representation; + using nmtools::array::sycl::local_mem_type_representation; + using nmtools::array::sycl::fp_config_representation; + using nmtools::array::sycl::exec_capability_representation; + using nmtools::array::sycl::partition_property_representation; + using nmtools::array::sycl::local_mem_type_representation; + using nmtools::array::sycl::partition_affinity_domain_representation; + using nmtools::array::sycl::global_mem_cache_type_representation; + + inline std::ostream& operator<<(std::ostream& lhs, const ::sycl::id<3>& idx) + { + lhs << idx[0] << " " << idx[1] << " " << idx[2]; + return lhs; + } + + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::device_type dtype) + { + lhs << device_type_representation[dtype]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<1> range) + { + lhs << range[0]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<2> range) + { + lhs << range[0] << "," << range[1]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<3> range) + { + lhs << range[0] << ", " << range[1] << ", " << range[2]; + return lhs; + } + + template + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::nd_range range) + { + lhs << "global:" << range.get_global_range() << "," + << "local:" << range.get_local_range() << "," + << "group:" << range.get_group_range() + ; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::global_mem_cache_type cache_type) + { + lhs << global_mem_cache_type_representation[cache_type]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::local_mem_type local_type) + { + lhs << local_mem_type_representation[local_type]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::fp_config fpconfig) + { + lhs << fp_config_representation[fpconfig]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::execution_capability ecap) + { + lhs << exec_capability_representation[ecap]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::partition_property pprop) + { + lhs << partition_property_representation[pprop]; + return lhs; + } + + inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::partition_affinity_domain domain) + { + lhs << partition_affinity_domain_representation[domain]; + return lhs; + } +} // namespace std + +#endif // NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP \ No newline at end of file diff --git a/tests/array/CMakeLists.txt b/tests/array/CMakeLists.txt index d537a9f22..0365b6115 100644 --- a/tests/array/CMakeLists.txt +++ b/tests/array/CMakeLists.txt @@ -249,7 +249,6 @@ add_executable(${PROJECT_NAME}-doctest tests.cpp ${MISC_TEST_SOURCES} ) -target_compile_features(${PROJECT_NAME}-doctest PRIVATE cxx_std_17) if (doctest_FOUND) target_link_libraries(${PROJECT_NAME}-doctest PRIVATE doctest::doctest) endif() @@ -261,13 +260,11 @@ if (NMTOOLS_ENABLE_TEST_BENCHMARKS) target_link_libraries(${PROJECT_NAME}-doctest PRIVATE nanobench) endif() -apply_nmtools( - TARGET ${PROJECT_NAME}-doctest - COMPILE_OPTIONS -g -) +target_compile_options(${PROJECT_NAME}-doctest PRIVATE --std=c++17) target_include_directories( ${PROJECT_NAME}-doctest PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include + ${NMTOOLS_INCLUDE_DIR} ) if (NMTOOLS_CODE_COVERAGE) diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt index 1b4bb43e7..3512fda87 100644 --- a/tests/sycl/CMakeLists.txt +++ b/tests/sycl/CMakeLists.txt @@ -84,11 +84,12 @@ set(NMTOOLS_SYCL_TEST_SOURCES ${NMTOOLS_SYCL_TEST_SOURCES} array/mean.cpp composition/add_tanh.cpp - # composition/divide_subtract.cpp + composition/divide_subtract.cpp composition/fabs_square_sum.cpp composition/fabs_square.cpp - # composition/multiply_add.cpp + composition/multiply_add.cpp composition/multiply_tanh.cpp + composition/multiply_add_tanh.cpp composition/reduce_add_divide.cpp composition/reduce_add_tanh.cpp composition/reduce_maximum_subtract_exp.cpp diff --git a/tests/sycl/composition/divide_subtract.cpp b/tests/sycl/composition/divide_subtract.cpp index b13d0552c..6ee0034e6 100644 --- a/tests/sycl/composition/divide_subtract.cpp +++ b/tests/sycl/composition/divide_subtract.cpp @@ -31,10 +31,26 @@ TEST_CASE("divide_subtract(case1)" * doctest::test_suite("array::divide_subtract auto a_step = 1; auto a_flat = na::arange(a_start,a_stop,a_step); + auto b_shape = nmtools_array{128}; + auto b_numel = ix::product(b_shape); + auto b_start = -b_numel/2; + auto b_stop = b_start + b_numel; + auto b_step = 1; + auto b_flat = na::arange(b_start,b_stop,b_step); + + auto c_shape = nmtools_array{128}; + auto c_numel = ix::product(c_shape); + auto c_start = -c_numel; + auto c_stop = c_numel; + auto c_step = 2; + auto c_flat = na::arange(c_start,c_stop,c_step); + auto a = na::reshape(a_flat,a_shape); + auto b = na::reshape(b_flat,b_shape); + auto c = na::reshape(c_flat,c_shape); - auto x = view::divide(a,9); - auto y = view::subtract(x,a); + auto x = view::divide(a,b); + auto y = view::subtract(x,c); CUDA_SUBCASE( y ); } \ No newline at end of file diff --git a/tests/sycl/composition/multiply_add.cpp b/tests/sycl/composition/multiply_add.cpp index b4b92de20..5fb3ffb14 100644 --- a/tests/sycl/composition/multiply_add.cpp +++ b/tests/sycl/composition/multiply_add.cpp @@ -31,10 +31,26 @@ TEST_CASE("multiply_add(case1)" * doctest::test_suite("array::multiply_add")) auto a_step = 1; auto a_flat = na::arange(a_start,a_stop,a_step); + auto b_shape = nmtools_array{128}; + auto b_numel = ix::product(b_shape); + auto b_start = -b_numel/2; + auto b_stop = b_start + b_numel; + auto b_step = 1; + auto b_flat = na::arange(b_start,b_stop,b_step); + + auto c_shape = nmtools_array{128}; + auto c_numel = ix::product(c_shape); + auto c_start = -c_numel; + auto c_stop = c_numel; + auto c_step = 2; + auto c_flat = na::arange(c_start,c_stop,c_step); + auto a = na::reshape(a_flat,a_shape); + auto b = na::reshape(b_flat,b_shape); + auto c = na::reshape(c_flat,c_shape); - auto x = view::multiply(a,a); - auto y = view::add(x,x); + auto x = view::multiply(a,b); + auto y = view::add(x,c); CUDA_SUBCASE( y ); } \ No newline at end of file diff --git a/tests/sycl/composition/multiply_add_tanh.cpp b/tests/sycl/composition/multiply_add_tanh.cpp new file mode 100644 index 000000000..876e213e4 --- /dev/null +++ b/tests/sycl/composition/multiply_add_tanh.cpp @@ -0,0 +1,59 @@ +#include "nmtools/array/array/ufuncs/multiply.hpp" +#include "nmtools/array/array/ufuncs/add.hpp" +#include "nmtools/array/array/ufuncs/tanh.hpp" +#include "nmtools/array/array/arange.hpp" +#include "nmtools/array/array/reshape.hpp" +#include "nmtools/array/eval/sycl.hpp" +#include "nmtools/testing/doctest.hpp" +#include "nmtools/testing/data/array/arange.hpp" +#include "nmtools/array/functional/ufuncs/multiply.hpp" +#include "nmtools/array/functional/ufuncs/add.hpp" +#include "nmtools/array/functional/ufuncs/tanh.hpp" + +namespace nm = nmtools; +namespace na = nmtools::array; +namespace ix = nmtools::index; +namespace fn = nmtools::functional; +namespace view = nm::view; + +#define CUDA_SUBCASE(...) \ +{ \ + auto result = na::eval(__VA_ARGS__, na::sycl::default_context()); \ + auto expect = na::eval(__VA_ARGS__); \ + NMTOOLS_ASSERT_EQUAL( nm::shape(result), nm::shape(expect) ); \ + NMTOOLS_ASSERT_CLOSE( result, expect ); \ +} + +TEST_CASE("multiply_add_tanh(case1)" * doctest::test_suite("array::multiply_add_tanh")) +{ + auto a_shape = nmtools_array{128}; + auto a_numel = ix::product(a_shape); + auto a_start = 0; + auto a_stop = a_start + a_numel; + auto a_step = 1; + auto a_flat = na::arange(a_start,a_stop,a_step); + + auto b_shape = nmtools_array{128}; + auto b_numel = ix::product(b_shape); + auto b_start = -b_numel/2; + auto b_stop = b_start + b_numel; + auto b_step = 1; + auto b_flat = na::arange(b_start,b_stop,b_step); + + auto c_shape = nmtools_array{128}; + auto c_numel = ix::product(c_shape); + auto c_start = -c_numel; + auto c_stop = c_numel; + auto c_step = 2; + auto c_flat = na::arange(c_start,c_stop,c_step); + + auto a = na::reshape(a_flat,a_shape); + auto b = na::reshape(b_flat,b_shape); + auto c = na::reshape(c_flat,c_shape); + + auto x = view::multiply(a,b); + auto y = view::add(x,c); + auto z = view::tanh(y); + + CUDA_SUBCASE( z ); +} \ No newline at end of file