diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml
index 682c928ce..757430695 100644
--- a/.github/workflows/sycl.yml
+++ b/.github/workflows/sycl.yml
@@ -11,8 +11,8 @@ on:
     - '*'
 
 jobs:
-  opensycl-clang14-omp-docker:
-    name: ${{ matrix.name }}
+  hipsycl-clang14-omp-docker:
+    name: hipsycl-clang14-omp-docker
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v2
@@ -21,4 +21,12 @@ jobs:
           docker build . --tag nmtools:sycl-clang14-omp --file docker/sycl.dockerfile
       - name: run tests
         run: |
-          docker run --rm nmtools:sycl-clang14-omp
\ No newline at end of file
+          docker run --rm nmtools:sycl-clang14-omp
+  hipsycl-clang14-cuda-docker:
+    name: hipsycl-clang14-cuda-docker
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - name: build docker
+        run: |
+          docker build . --tag nmtools:sycl-clang14-cuda --file docker/sycl-cuda.dockerfile
\ No newline at end of file
diff --git a/cmake/toolchains/sycl-clang14-cuda.cmake b/cmake/toolchains/sycl-clang14-cuda.cmake
new file mode 100644
index 000000000..d7082e710
--- /dev/null
+++ b/cmake/toolchains/sycl-clang14-cuda.cmake
@@ -0,0 +1,17 @@
+set(CMAKE_C_COMPILER /usr/local/bin/syclcc)
+set(CMAKE_CXX_COMPILER /usr/local/bin/syclcc)
+
+SET (CMAKE_C_COMPILER_WORKS 1)
+add_compile_options(-W -Wall -Wextra -Werror
+    -Wno-gnu-string-literal-operator-template
+    -Wno-unknown-cuda-version
+    --acpp-targets="cuda:sm_60"
+    --acpp-clang=/usr/bin/clang++-14
+)
+add_link_options(
+    --acpp-targets="cuda:sm_60"
+    --acpp-clang=/usr/bin/clang++-14
+)
+set(NMTOOLS_TEST_CUDA_PATH "/usr/local/cuda/" CACHE STRING "manually set cuda path")
+link_directories("${NMTOOLS_TEST_CUDA_PATH}/lib64")
+link_libraries(cudart_static dl rt pthread)
\ No newline at end of file
diff --git a/cmake/toolchains/sycl-clang14-omp.cmake b/cmake/toolchains/sycl-clang14-omp.cmake
index 05b71a5b5..172c510c3 100644
--- a/cmake/toolchains/sycl-clang14-omp.cmake
+++ b/cmake/toolchains/sycl-clang14-omp.cmake
@@ -5,4 +5,8 @@ SET (CMAKE_C_COMPILER_WORKS 1)
 add_compile_options(-W -Wall -Werror -Wextra -Wno-gnu-string-literal-operator-template
     --acpp-targets=omp
     --acpp-clang=/usr/bin/clang++-14
+)
+add_link_options(
+    --acpp-targets=omp
+    --acpp-clang=/usr/bin/clang++-14
 )
\ No newline at end of file
diff --git a/docker/cuda-sycl.dockerfile b/docker/sycl-cuda.dockerfile
similarity index 85%
rename from docker/cuda-sycl.dockerfile
rename to docker/sycl-cuda.dockerfile
index 71a43bf44..c7c163977 100644
--- a/docker/cuda-sycl.dockerfile
+++ b/docker/sycl-cuda.dockerfile
@@ -49,12 +49,13 @@ RUN bash scripts/install_doctest.sh
 RUN apt install -y libclang-dev clang-tools libomp-dev llvm-dev lld libboost-dev libboost-fiber-dev libboost-context-dev
 RUN bash scripts/install_opensycl.sh
 
-ARG toolchain=sycl-clang14-omp
-RUN mkdir -p build && cd build \
+ARG toolchain=sycl-clang14-cuda
+RUN mkdir -p build/${toolchain} && cd build/${toolchain} \
     && cmake -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/${toolchain}.cmake \
         -DNMTOOLS_BUILD_META_TESTS=OFF -DNMTOOLS_BUILD_UTL_TESTS=OFF -DNMTOOLS_TEST_ALL=OFF \
         -DNMTOOLS_BUILD_SYCL_TESTS=ON \
-        ../ \
-    && make -j2 VERBOSE=1
+        ../.. \
+    && make -j2 VERBOSE=1 numeric-tests-sycl-doctest
 
-CMD ["/workspace/nmtools/build/tests/sycl/numeric-tests-sycl-doctest"]
\ No newline at end of file
+ENV toolchain=${toolchain}
+CMD ["sh", "-c", "/workspace/nmtools/build/${toolchain}/tests/sycl/numeric-tests-sycl-doctest"]
\ No newline at end of file
diff --git a/docker/sycl.dockerfile b/docker/sycl.dockerfile
index 1ad594aab..98da03b6c 100644
--- a/docker/sycl.dockerfile
+++ b/docker/sycl.dockerfile
@@ -49,11 +49,12 @@ RUN apt install -y libclang-dev clang-tools libomp-dev llvm-dev lld libboost-dev
 RUN bash scripts/install_opensycl.sh
 
 ARG toolchain=sycl-clang14-omp
-RUN mkdir -p build && cd build \
+RUN mkdir -p build/${toolchain} && cd build/${toolchain} \
     && cmake -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/${toolchain}.cmake \
         -DNMTOOLS_BUILD_META_TESTS=OFF -DNMTOOLS_BUILD_UTL_TESTS=OFF -DNMTOOLS_TEST_ALL=OFF \
         -DNMTOOLS_BUILD_SYCL_TESTS=ON \
-        ../ \
-    && make -j2 VERBOSE=1
+        ../.. \
+    && make -j2 VERBOSE=1 numeric-tests-sycl-doctest
 
-CMD ["/workspace/nmtools/build/tests/sycl/numeric-tests-sycl-doctest"]
\ No newline at end of file
+ENV toolchain=${toolchain}
+CMD ["sh", "-c", "/workspace/nmtools/build/${toolchain}/tests/sycl/numeric-tests-sycl-doctest"]
\ No newline at end of file
diff --git a/include/nmtools/array/eval/kernel_helper.hpp b/include/nmtools/array/eval/kernel_helper.hpp
index ba6f84bd7..62ced3358 100644
--- a/include/nmtools/array/eval/kernel_helper.hpp
+++ b/include/nmtools/array/eval/kernel_helper.hpp
@@ -156,6 +156,14 @@ namespace nmtools::array
         }
     };
 
+    template <typename T>
+    constexpr auto compute_offset(kernel_size<T> thread_id, kernel_size<T> block_id, kernel_size<T> block_size)
+    {
+        // TODO: check for grid shape
+        auto idx = block_id.x() * block_size.x() + thread_id.x();
+        return idx;
+    }
+
     template <
         typename size_type=nm_size_t
         , typename mutable_array_t
@@ -169,14 +177,14 @@ namespace nmtools::array
         , kernel_size<size_type> block_id
         , kernel_size<size_type> block_size
     ) {
-        if constexpr (meta::is_maybe_v<array_t>) {
+        if constexpr (meta::is_maybe_v<meta::remove_cvref_t<decltype(result)>>) {
             if (!static_cast<bool>(result)) {
                 return;
             }
             assign_result(output,*result,thread_id,block_id,block_size);
         } else {
             auto size = nmtools::size(output);
-            auto idx = block_id.x() * block_size.x() + thread_id.x();
+            auto idx = compute_offset(thread_id,block_id,block_size);
             if (idx < size) {
                 auto flat_lhs = view::mutable_flatten(output);
                 auto flat_rhs = view::flatten(result);
diff --git a/include/nmtools/array/eval/sycl/context.hpp b/include/nmtools/array/eval/sycl/context.hpp
index 0d083a984..7034f7165 100644
--- a/include/nmtools/array/eval/sycl/context.hpp
+++ b/include/nmtools/array/eval/sycl/context.hpp
@@ -3,10 +3,13 @@
 
 #include "nmtools/meta.hpp"
 #include "nmtools/array/ndarray.hpp"
+#include "nmtools/array/functional/functor.hpp"
+#include "nmtools/array/eval/sycl/info.hpp"
 #include "nmtools/array/eval/kernel_helper.hpp"
 #include "nmtools/exception.hpp"
 
 #include <sycl/sycl.hpp>
+#include <iostream>
 #include <memory>
 
 namespace nmtools::array
@@ -118,12 +121,18 @@ namespace nmtools::array::sycl
 
     struct context_t
     {
-        using queue_t = ::sycl::queue;
+        using queue_t  = ::sycl::queue;
+        using device_t = ::sycl::device;
 
-        std::shared_ptr<queue_t> queue;
+        // TODO: share same queue for same context (?)
+        // std::shared_ptr<queue_t> queue;
+        nmtools_maybe<device_t> device = meta::Nothing;
 
         context_t()
-            : queue(std::make_shared<queue_t>())
+        {}
+
+        context_t(device_t device)
+            : device(device)
         {}
 
         template <typename T>
@@ -158,35 +167,42 @@ namespace nmtools::array::sycl
         auto create_array(const array_t& array)
         {
             static_assert(
-                meta::is_ndarray_v<array_t>
-                && !meta::is_view_v<array_t>
+                ((meta::is_ndarray_v<meta::remove_pointer_t<array_t>> && meta::is_pointer_v<array_t>)
+                    || meta::is_num_v<array_t> || meta::is_ndarray_v<array_t>)
+                    && !meta::is_view_v<meta::remove_pointer_t<array_t>>
                 , "unsupported array type for create_array"
             );
-            const auto buffer = nmtools::data(array);
-            const auto numel  = nmtools::size(array);
-            const auto shape  = nmtools::shape(array);
-            const auto dim    = nmtools::dim(array);
-
-            using element_t = meta::get_element_type_t<array_t>;
-            using dim_t     = meta::remove_cvref_t<decltype(dim)>;
-
-            // TODO: keep src shape traits
-            using device_shape_t = nmtools_static_vector<size_t,8>;
-            auto device_shape = device_shape_t{};
-            device_shape.resize(dim);
-            for (size_t i=0; i<dim; i++) {
-                at(device_shape,i) = at(shape,i);
-            }
+            if constexpr (meta::is_num_v<array_t>) {
+                return array;
+            } else if constexpr (meta::is_pointer_v<array_t>) {
+                return create_array(*array);
+            } else {
+                const auto buffer = nmtools::data(array);
+                const auto numel  = nmtools::size(array);
+                const auto shape  = nmtools::shape(array);
+                const auto dim    = nmtools::dim(array);
+
+                using element_t = meta::get_element_type_t<array_t>;
+                using dim_t     = meta::remove_cvref_t<decltype(dim)>;
+
+                // TODO: keep src shape traits
+                using device_shape_t = nmtools_static_vector<size_t,8>;
+                auto device_shape = device_shape_t{};
+                device_shape.resize(dim);
+                for (size_t i=0; i<dim; i++) {
+                    at(device_shape,i) = at(shape,i);
+                }
 
-            using buffer_t = ::sycl::buffer<element_t>;
-            auto sycl_buffer = buffer_t(buffer,(size_t)numel);
+                using buffer_t = ::sycl::buffer<element_t>;
+                auto sycl_buffer = buffer_t(buffer,(size_t)numel);
 
-            using device_array_t = device_array<buffer_t,device_shape_t,dim_t>;
-            using device_array_ptr = std::shared_ptr<device_array_t>;
+                using device_array_t = device_array<buffer_t,device_shape_t,dim_t>;
+                using device_array_ptr = std::shared_ptr<device_array_t>;
 
-            auto array_raw_ptr = new device_array_t{sycl_buffer,device_shape,dim};
-            auto array_ptr = device_array_ptr(array_raw_ptr);
-            return array_ptr;
+                auto array_raw_ptr = new device_array_t{sycl_buffer,device_shape,dim};
+                auto array_ptr = device_array_ptr(array_raw_ptr);
+                return array_ptr;
+            }
         }
 
         template <typename T>
@@ -226,6 +242,12 @@ namespace nmtools::array::sycl
         template <typename output_array_t, typename function_t, typename...args_t, auto...Is, template<auto...>typename sequence>
         auto run_(output_array_t& output, const function_t& f, nmtools_tuple<args_t...> args_pack, sequence<Is...>)
         {
+            auto queue = std::shared_ptr<queue_t>();
+            if (!device) {
+                queue = std::make_shared<queue_t>();
+            } else {
+                queue = std::make_shared<queue_t>(*device);
+            }
             using element_t = meta::get_element_type_t<output_array_t>;
             auto numel = nmtools::size(output);
             // TODO: pass actual type (constant / clipped shape) as is to device
@@ -249,30 +271,55 @@ namespace nmtools::array::sycl
                 [[maybe_unused]]
                 constexpr auto N = sizeof...(args_t);
 
-                // .accessor() sycl-equivalent to cudaMemcpy + pass to kernel (?)
+                // get_accessor / .accessor() sycl-equivalent to cudaMemcpy + pass to kernel (?)
                 auto operands = nmtools_tuple{get_accessor(nmtools::get<Is>(args_pack),cgh,access_mode)...};
 
-                auto kernel_range = ::sycl::range<1>(thread_size);
-                cgh.parallel_for(kernel_range,[=](::sycl::id<1> id){
+                auto kernel_range = ::sycl::nd_range<1>(thread_size, warp_size);
+
+                // TODO: fix wrong result on cuda, but correct on openmp:
+                // seems like the work item is clipped to 32(?) on cuda, the rest of work item is not executed
+                #if 0
+                std::cout << "- numel: " << numel << "\n";
+                std::cout << "- warp_size: " << warp_size << "\n";
+                std::cout << "- thread_size: " << thread_size << "\n";
+                std::cout << "- kernel_range: " << kernel_range << "\n";
+
+                ::sycl::stream out(1024, 256, cgh);
+                out << ::sycl::endl;
+                #endif
+
+                // TODO: change to nd_item with 3 dim to properly use thread & block structure like cuda
+                cgh.parallel_for(kernel_range,[=](::sycl::nd_item<1> item){
                     auto output = create_mutable_array(&output_accessor[0],&output_shape_accessor[0],output_dim);
                     auto result = functional::apply(f,operands);
                     // TODO: properly get the thread & kernel id and shape
-                    auto thread_id  = array::kernel_size<size_t>{id.get(0),0,0};
+                    auto thread_id  = array::kernel_size<size_t>{item.get_global_id(),0,0};
                     auto block_id   = array::kernel_size<size_t>{0,0,0};
                     auto block_size = array::kernel_size<size_t>{1,1,1};
+
+                    #if 0
+                    auto idx = compute_offset(thread_id,block_id,block_size);
+                    out << item.get_global_id()
+                        << "->"
+                        << idx
+                        << ";"
+                        << ::sycl::flush
+                    ;
+                    #endif
+
                     array::assign_result(output,result,thread_id,block_id,block_size);
                 });
             });
 
             this->copy_buffer(output_buffer,output);
         }
-        
+
         template <typename function_t, typename output_array_t, template<typename...>typename tuple, typename...operands_t>
         auto run(const function_t& f, output_array_t& output, const tuple<operands_t...>& operands)
         {
             constexpr auto N = sizeof...(operands_t);
             auto device_operands = meta::template_reduce<N>([&](auto init, auto index){
-                const auto& arg_i = nmtools::get<index>(operands);
+                const auto& arg_i = nmtools::at(operands,index);
                 if constexpr (meta::is_num_v<decltype(arg_i)>) {
                     return utility::tuple_append(init,arg_i);
                 } else {
@@ -286,15 +333,130 @@ namespace nmtools::array::sycl
             this->run_(output,f,device_operands,sequence_t{});
         }
     };
+} // namespace nmtools::array::sycl
 
+#define PRINT_PLATFORM_PROPERTY(platform, prop) \
+    std::cout << "- " << #prop << ": " \
+        << platform.get_info<::sycl::info::platform::prop>() << std::endl;
+    
+#define PRINT_DEVICE_PROPERTY(selected_device, prop) \
+std::cout << "- " << #prop << ": " \
+    << selected_device.get_info<::sycl::info::device::prop>() << std::endl;
+
+namespace nmtools::array::sycl
+{
     inline auto default_context()
     {
         static std::shared_ptr<context_t> default_context;
         if (!default_context) {
-            default_context = std::make_shared<context_t>();
+            auto sycl_devices = ::sycl::device::get_devices();
+            auto platform_idx = 0ul;
+            if (auto env_idx = std::getenv("NMTOOLS_SYCL_DEFAULT_PLATFORM_IDX")) {
+                platform_idx = std::stoi(env_idx);
+            }
+            // TODO: better logging utilities
+            std::cout << "\033[1;33m[nmtools sycl]\033[0m number of sycl devices: " << sycl_devices.size() << "\n";
+            for (auto i=0ul; i<sycl_devices.size(); i++) {
+                auto device = sycl_devices.at(i);
+                auto platform = device.get_platform();
+
+                std::cout << "\033[1;33m[nmtools sycl]\033[0m platform #" << i << ":\n";
+                PRINT_PLATFORM_PROPERTY(platform, name);
+                PRINT_PLATFORM_PROPERTY(platform, vendor);
+                PRINT_PLATFORM_PROPERTY(platform, version);
+                PRINT_PLATFORM_PROPERTY(platform, profile);
+                PRINT_PLATFORM_PROPERTY(platform, extensions);
+            }
+            auto selected_device = sycl_devices.at(platform_idx);
+            std::cout << "\033[1;33m[nmtools sycl]\033[0m default context using platform #" << platform_idx << "\n";
+            // TODO: log level
+            {
+                PRINT_DEVICE_PROPERTY(selected_device, name);
+                PRINT_DEVICE_PROPERTY(selected_device, vendor);
+                PRINT_DEVICE_PROPERTY(selected_device, driver_version);
+                PRINT_DEVICE_PROPERTY(selected_device, profile);
+                PRINT_DEVICE_PROPERTY(selected_device, version);
+                PRINT_DEVICE_PROPERTY(selected_device, opencl_c_version);
+                PRINT_DEVICE_PROPERTY(selected_device, extensions);
+                PRINT_DEVICE_PROPERTY(selected_device, device_type);
+                PRINT_DEVICE_PROPERTY(selected_device, vendor_id);
+                PRINT_DEVICE_PROPERTY(selected_device, max_compute_units);
+                PRINT_DEVICE_PROPERTY(selected_device, max_work_item_dimensions);
+                PRINT_DEVICE_PROPERTY(selected_device, max_work_item_sizes<1>);
+                PRINT_DEVICE_PROPERTY(selected_device, max_work_item_sizes<2>);
+                PRINT_DEVICE_PROPERTY(selected_device, max_work_item_sizes<3>);
+                PRINT_DEVICE_PROPERTY(selected_device, max_work_group_size);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_char);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_short);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_int);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_long);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_float);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_double);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_vector_width_half);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_char);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_short);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_int);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_long);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_float);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_double);
+                PRINT_DEVICE_PROPERTY(selected_device, native_vector_width_half);
+                PRINT_DEVICE_PROPERTY(selected_device, max_clock_frequency);
+                PRINT_DEVICE_PROPERTY(selected_device, address_bits);
+                PRINT_DEVICE_PROPERTY(selected_device, max_mem_alloc_size);
+                PRINT_DEVICE_PROPERTY(selected_device, image_support);
+                PRINT_DEVICE_PROPERTY(selected_device, max_read_image_args);
+                PRINT_DEVICE_PROPERTY(selected_device, max_write_image_args);
+                PRINT_DEVICE_PROPERTY(selected_device, image2d_max_height);
+                PRINT_DEVICE_PROPERTY(selected_device, image2d_max_width);
+                PRINT_DEVICE_PROPERTY(selected_device, image3d_max_height);
+                PRINT_DEVICE_PROPERTY(selected_device, image3d_max_width);
+                PRINT_DEVICE_PROPERTY(selected_device, image3d_max_depth);
+                PRINT_DEVICE_PROPERTY(selected_device, image_max_buffer_size);
+                PRINT_DEVICE_PROPERTY(selected_device, image_max_array_size);
+                PRINT_DEVICE_PROPERTY(selected_device, max_samplers);
+                PRINT_DEVICE_PROPERTY(selected_device, max_parameter_size);
+                PRINT_DEVICE_PROPERTY(selected_device, mem_base_addr_align);
+                
+                PRINT_DEVICE_PROPERTY(selected_device, half_fp_config);
+                PRINT_DEVICE_PROPERTY(selected_device, single_fp_config);
+                PRINT_DEVICE_PROPERTY(selected_device, double_fp_config);
+                PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_type);
+                PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_line_size);
+                PRINT_DEVICE_PROPERTY(selected_device, global_mem_cache_size);
+                PRINT_DEVICE_PROPERTY(selected_device, global_mem_size);
+                PRINT_DEVICE_PROPERTY(selected_device, max_constant_buffer_size);
+                PRINT_DEVICE_PROPERTY(selected_device, max_constant_args);
+                PRINT_DEVICE_PROPERTY(selected_device, local_mem_type);
+                PRINT_DEVICE_PROPERTY(selected_device, local_mem_size);
+                PRINT_DEVICE_PROPERTY(selected_device, error_correction_support);
+                PRINT_DEVICE_PROPERTY(selected_device, host_unified_memory);
+                PRINT_DEVICE_PROPERTY(selected_device, profiling_timer_resolution);
+                PRINT_DEVICE_PROPERTY(selected_device, is_endian_little);
+                PRINT_DEVICE_PROPERTY(selected_device, is_available);
+                PRINT_DEVICE_PROPERTY(selected_device, is_compiler_available);
+                PRINT_DEVICE_PROPERTY(selected_device, is_linker_available);
+                PRINT_DEVICE_PROPERTY(selected_device, execution_capabilities);
+                PRINT_DEVICE_PROPERTY(selected_device, queue_profiling);
+                PRINT_DEVICE_PROPERTY(selected_device, built_in_kernels);
+                
+                
+                PRINT_DEVICE_PROPERTY(selected_device, printf_buffer_size);
+                PRINT_DEVICE_PROPERTY(selected_device, preferred_interop_user_sync);
+                PRINT_DEVICE_PROPERTY(selected_device, partition_max_sub_devices);
+                
+                PRINT_DEVICE_PROPERTY(selected_device, partition_properties);
+                PRINT_DEVICE_PROPERTY(selected_device, partition_affinity_domains);
+                PRINT_DEVICE_PROPERTY(selected_device, partition_type_property);
+                PRINT_DEVICE_PROPERTY(selected_device, partition_type_affinity_domain);
+                PRINT_DEVICE_PROPERTY(selected_device, reference_count);
+            }
+            default_context = std::make_shared<context_t>(selected_device);
         }
         return default_context;
     }
 }
 
+#undef PRINT_PLATFORM_PROPERTY
+#undef PRINT_DEVICE_PROPERTY
+
 #endif // NMTOOLS_ARRAY_EVAL_SYCL_CONTEXT_HPP
\ No newline at end of file
diff --git a/include/nmtools/array/eval/sycl/info.hpp b/include/nmtools/array/eval/sycl/info.hpp
new file mode 100644
index 000000000..bc20d5616
--- /dev/null
+++ b/include/nmtools/array/eval/sycl/info.hpp
@@ -0,0 +1,181 @@
+// LICENSE: GNU General Public License v3.0
+// adapted from https://github.com/illuhad/syclinfo/blob/f644f1a25dfd94aa07f7c58f5f6171e5651a966d/syclinfo.cpp:
+// - fix for some global_mem_cache_type
+// - include in nmtools namespace
+// - adapt namespace changes ::sycl instead of cl::sycl
+// - no boost dependencies
+// - inline for c++17
+
+#ifndef NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP
+#define NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP
+
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <map>
+
+namespace nmtools::array::sycl
+{
+    inline std::map<::sycl::info::device_type, std::string> device_type_representation {
+    {::sycl::info::device_type::cpu, "CPU"},
+    {::sycl::info::device_type::gpu, "GPU"},
+    {::sycl::info::device_type::accelerator, "Accelerator"},
+    {::sycl::info::device_type::custom, "Custom"},
+    {::sycl::info::device_type::automatic, "Automatic"},
+    {::sycl::info::device_type::host, "Host"},
+    {::sycl::info::device_type::all, "All"}
+    };
+
+    inline std::map<::sycl::info::global_mem_cache_type, std::string> global_mem_cache_type_representation {
+    {::sycl::info::global_mem_cache_type::none, "None"},
+    {::sycl::info::global_mem_cache_type::read_only, "Read-only"},
+    {::sycl::info::global_mem_cache_type::read_write, "Read-Write"}
+    };
+
+
+    inline std::map<::sycl::info::local_mem_type, std::string> local_mem_type_representation {
+    {::sycl::info::local_mem_type::none, "None"},
+    {::sycl::info::local_mem_type::local, "Local"},
+    {::sycl::info::local_mem_type::global, "Global"}
+    };
+
+    inline std::map<::sycl::info::fp_config, std::string> fp_config_representation {
+    { ::sycl::info::fp_config::denorm, "denorm"},
+    { ::sycl::info::fp_config::inf_nan, "inf_nan"},
+    { ::sycl::info::fp_config::round_to_nearest, "round_to_nearest"},
+    { ::sycl::info::fp_config::round_to_zero, "round_to_zero"},
+    { ::sycl::info::fp_config::round_to_inf, "round_to_inf"},
+    { ::sycl::info::fp_config::fma, "fma"},
+    { ::sycl::info::fp_config::correctly_rounded_divide_sqrt, "correctly_rounded_divide_sqrt"},
+    { ::sycl::info::fp_config::soft_float, "soft_float" }
+    };
+
+    inline std::map<::sycl::info::execution_capability, std::string> exec_capability_representation{
+    {::sycl::info::execution_capability::exec_kernel, "exec_kernel"},
+    {::sycl::info::execution_capability::exec_native_kernel, "exec_native_kernel"}
+    };
+
+
+    inline std::map<::sycl::info::partition_property, std::string> 
+                            partition_property_representation {
+    {::sycl::info::partition_property::no_partition, "no_partition"},
+    {::sycl::info::partition_property::partition_equally, "partition_equally"},
+    {::sycl::info::partition_property::partition_by_counts, "partition_by_counts"},
+    {::sycl::info::partition_property::partition_by_affinity_domain, "partition_by_affinity_domain"}
+    };
+
+    inline std::map<::sycl::info::partition_affinity_domain, std::string> 
+                            partition_affinity_domain_representation {
+    {::sycl::info::partition_affinity_domain::not_applicable, "not_applicable"},
+    {::sycl::info::partition_affinity_domain::numa, "numa"},
+    {::sycl::info::partition_affinity_domain::L4_cache, "L4_cache"},
+    {::sycl::info::partition_affinity_domain::L3_cache, "L3_cache"},
+    {::sycl::info::partition_affinity_domain::L2_cache, "L2_cache"},
+    {::sycl::info::partition_affinity_domain::L1_cache, "L1_cache"},
+    {::sycl::info::partition_affinity_domain::next_partitionable, "next_partitionable"}
+    };
+} // namespace nmtools::array::sycl
+
+template<class T>
+std::ostream& operator<<(std::ostream& lhs, const std::vector<T>& rhs)
+{
+  for(std::size_t i = 0; i < rhs.size(); ++i)
+  {
+    lhs << rhs[i];
+    if (i != rhs.size()-1)
+      lhs << ", ";
+  }
+  return lhs;
+}
+
+namespace std
+{
+
+    using nmtools::array::sycl::device_type_representation;
+    using nmtools::array::sycl::local_mem_type_representation;
+    using nmtools::array::sycl::local_mem_type_representation;
+    using nmtools::array::sycl::fp_config_representation;
+    using nmtools::array::sycl::exec_capability_representation;
+    using nmtools::array::sycl::partition_property_representation;
+    using nmtools::array::sycl::local_mem_type_representation;
+    using nmtools::array::sycl::partition_affinity_domain_representation;
+    using nmtools::array::sycl::global_mem_cache_type_representation;
+
+    inline std::ostream& operator<<(std::ostream& lhs, const ::sycl::id<3>& idx)
+    {
+        lhs << idx[0] << " " << idx[1] << " " << idx[2];
+        return lhs;
+    }
+
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::device_type dtype)
+    {
+        lhs << device_type_representation[dtype];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<1> range)
+    {
+        lhs << range[0];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<2> range)
+    {
+        lhs << range[0] << "," << range[1];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::range<3> range)
+    {
+        lhs << range[0] << ", " << range[1] << ", " << range[2];
+        return lhs;
+    }
+
+    template <auto Dimension>
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::nd_range<Dimension> range)
+    {
+        lhs << "global:" << range.get_global_range() << ","
+            << "local:" << range.get_local_range() << ","
+            << "group:" << range.get_group_range()
+        ;
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::global_mem_cache_type cache_type)
+    {
+        lhs << global_mem_cache_type_representation[cache_type];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::local_mem_type local_type)
+    {
+        lhs << local_mem_type_representation[local_type];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::fp_config fpconfig)
+    {
+        lhs << fp_config_representation[fpconfig];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::execution_capability ecap)
+    {
+        lhs << exec_capability_representation[ecap];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::partition_property pprop)
+    {
+        lhs << partition_property_representation[pprop];
+        return lhs;
+    }
+
+    inline std::ostream& operator<<(std::ostream& lhs, ::sycl::info::partition_affinity_domain domain)
+    {
+        lhs << partition_affinity_domain_representation[domain];
+        return lhs;
+    }
+} // namespace std
+
+#endif // NMTOOLS_ARRAY_EVAL_SYCL_INFO_HPP
\ No newline at end of file
diff --git a/tests/array/CMakeLists.txt b/tests/array/CMakeLists.txt
index d537a9f22..0365b6115 100644
--- a/tests/array/CMakeLists.txt
+++ b/tests/array/CMakeLists.txt
@@ -249,7 +249,6 @@ add_executable(${PROJECT_NAME}-doctest tests.cpp
     ${MISC_TEST_SOURCES}
 )
 
-target_compile_features(${PROJECT_NAME}-doctest PRIVATE cxx_std_17)
 if (doctest_FOUND)
     target_link_libraries(${PROJECT_NAME}-doctest PRIVATE doctest::doctest)
 endif()
@@ -261,13 +260,11 @@ if (NMTOOLS_ENABLE_TEST_BENCHMARKS)
     target_link_libraries(${PROJECT_NAME}-doctest PRIVATE nanobench)
 endif()
 
-apply_nmtools(
-    TARGET ${PROJECT_NAME}-doctest
-    COMPILE_OPTIONS -g
-)
+target_compile_options(${PROJECT_NAME}-doctest PRIVATE --std=c++17)
 target_include_directories(
     ${PROJECT_NAME}-doctest PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${NMTOOLS_INCLUDE_DIR}
 )
 
 if (NMTOOLS_CODE_COVERAGE)
diff --git a/tests/sycl/CMakeLists.txt b/tests/sycl/CMakeLists.txt
index 1b4bb43e7..3512fda87 100644
--- a/tests/sycl/CMakeLists.txt
+++ b/tests/sycl/CMakeLists.txt
@@ -84,11 +84,12 @@ set(NMTOOLS_SYCL_TEST_SOURCES ${NMTOOLS_SYCL_TEST_SOURCES}
     array/mean.cpp
 
     composition/add_tanh.cpp
-    # composition/divide_subtract.cpp
+    composition/divide_subtract.cpp
     composition/fabs_square_sum.cpp
     composition/fabs_square.cpp
-    # composition/multiply_add.cpp
+    composition/multiply_add.cpp
     composition/multiply_tanh.cpp
+    composition/multiply_add_tanh.cpp
     composition/reduce_add_divide.cpp
     composition/reduce_add_tanh.cpp
     composition/reduce_maximum_subtract_exp.cpp
diff --git a/tests/sycl/composition/divide_subtract.cpp b/tests/sycl/composition/divide_subtract.cpp
index b13d0552c..6ee0034e6 100644
--- a/tests/sycl/composition/divide_subtract.cpp
+++ b/tests/sycl/composition/divide_subtract.cpp
@@ -31,10 +31,26 @@ TEST_CASE("divide_subtract(case1)" * doctest::test_suite("array::divide_subtract
     auto a_step  = 1;
     auto a_flat  = na::arange(a_start,a_stop,a_step);
 
+    auto b_shape = nmtools_array{128};
+    auto b_numel = ix::product(b_shape);
+    auto b_start = -b_numel/2;
+    auto b_stop  = b_start + b_numel;
+    auto b_step  = 1;
+    auto b_flat  = na::arange(b_start,b_stop,b_step);
+
+    auto c_shape = nmtools_array{128};
+    auto c_numel = ix::product(c_shape);
+    auto c_start = -c_numel;
+    auto c_stop  = c_numel;
+    auto c_step  = 2;
+    auto c_flat  = na::arange(c_start,c_stop,c_step);
+
     auto a = na::reshape(a_flat,a_shape);
+    auto b = na::reshape(b_flat,b_shape);
+    auto c = na::reshape(c_flat,c_shape);
 
-    auto x = view::divide(a,9);
-    auto y = view::subtract(x,a);
+    auto x = view::divide(a,b);
+    auto y = view::subtract(x,c);
 
     CUDA_SUBCASE( y );
 }
\ No newline at end of file
diff --git a/tests/sycl/composition/multiply_add.cpp b/tests/sycl/composition/multiply_add.cpp
index b4b92de20..5fb3ffb14 100644
--- a/tests/sycl/composition/multiply_add.cpp
+++ b/tests/sycl/composition/multiply_add.cpp
@@ -31,10 +31,26 @@ TEST_CASE("multiply_add(case1)" * doctest::test_suite("array::multiply_add"))
     auto a_step  = 1;
     auto a_flat  = na::arange(a_start,a_stop,a_step);
 
+    auto b_shape = nmtools_array{128};
+    auto b_numel = ix::product(b_shape);
+    auto b_start = -b_numel/2;
+    auto b_stop  = b_start + b_numel;
+    auto b_step  = 1;
+    auto b_flat  = na::arange(b_start,b_stop,b_step);
+
+    auto c_shape = nmtools_array{128};
+    auto c_numel = ix::product(c_shape);
+    auto c_start = -c_numel;
+    auto c_stop  = c_numel;
+    auto c_step  = 2;
+    auto c_flat  = na::arange(c_start,c_stop,c_step);
+
     auto a = na::reshape(a_flat,a_shape);
+    auto b = na::reshape(b_flat,b_shape);
+    auto c = na::reshape(c_flat,c_shape);
 
-    auto x = view::multiply(a,a);
-    auto y = view::add(x,x);
+    auto x = view::multiply(a,b);
+    auto y = view::add(x,c);
 
     CUDA_SUBCASE( y );
 }
\ No newline at end of file
diff --git a/tests/sycl/composition/multiply_add_tanh.cpp b/tests/sycl/composition/multiply_add_tanh.cpp
new file mode 100644
index 000000000..876e213e4
--- /dev/null
+++ b/tests/sycl/composition/multiply_add_tanh.cpp
@@ -0,0 +1,59 @@
+#include "nmtools/array/array/ufuncs/multiply.hpp"
+#include "nmtools/array/array/ufuncs/add.hpp"
+#include "nmtools/array/array/ufuncs/tanh.hpp"
+#include "nmtools/array/array/arange.hpp"
+#include "nmtools/array/array/reshape.hpp"
+#include "nmtools/array/eval/sycl.hpp"
+#include "nmtools/testing/doctest.hpp"
+#include "nmtools/testing/data/array/arange.hpp"
+#include "nmtools/array/functional/ufuncs/multiply.hpp"
+#include "nmtools/array/functional/ufuncs/add.hpp"
+#include "nmtools/array/functional/ufuncs/tanh.hpp"
+
+namespace nm = nmtools;
+namespace na = nmtools::array;
+namespace ix = nmtools::index;
+namespace fn = nmtools::functional;
+namespace view = nm::view;
+
+#define CUDA_SUBCASE(...) \
+{ \
+    auto result = na::eval(__VA_ARGS__, na::sycl::default_context()); \
+    auto expect = na::eval(__VA_ARGS__); \
+    NMTOOLS_ASSERT_EQUAL( nm::shape(result), nm::shape(expect) ); \
+    NMTOOLS_ASSERT_CLOSE( result, expect ); \
+}
+
+TEST_CASE("multiply_add_tanh(case1)" * doctest::test_suite("array::multiply_add_tanh"))
+{
+    auto a_shape = nmtools_array{128};
+    auto a_numel = ix::product(a_shape);
+    auto a_start = 0;
+    auto a_stop  = a_start + a_numel;
+    auto a_step  = 1;
+    auto a_flat  = na::arange(a_start,a_stop,a_step);
+
+    auto b_shape = nmtools_array{128};
+    auto b_numel = ix::product(b_shape);
+    auto b_start = -b_numel/2;
+    auto b_stop  = b_start + b_numel;
+    auto b_step  = 1;
+    auto b_flat  = na::arange(b_start,b_stop,b_step);
+
+    auto c_shape = nmtools_array{128};
+    auto c_numel = ix::product(c_shape);
+    auto c_start = -c_numel;
+    auto c_stop  = c_numel;
+    auto c_step  = 2;
+    auto c_flat  = na::arange(c_start,c_stop,c_step);
+
+    auto a = na::reshape(a_flat,a_shape);
+    auto b = na::reshape(b_flat,b_shape);
+    auto c = na::reshape(c_flat,c_shape);
+
+    auto x = view::multiply(a,b);
+    auto y = view::add(x,c);
+    auto z = view::tanh(y);
+
+    CUDA_SUBCASE( z );
+}
\ No newline at end of file