diff --git a/CMakeLists.txt b/CMakeLists.txt index 32f74c0..7a431fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,22 @@ # THE SOFTWARE. cmake_minimum_required (VERSION 2.8) -project(amdovx) +project(amdovx-core) + +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) + +find_package(OpenCL QUIET) +find_package(OpenCV QUIET) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +file(COPY examples DESTINATION ${CMAKE_BINARY_DIR}) add_subdirectory(openvx) add_subdirectory(runvx) -add_subdirectory(runcl) + +if(OpenCL_FOUND) + add_subdirectory(runcl) +endif(OpenCL_FOUND) diff --git a/README.md b/README.md index 9cf1f7e..23f0620 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,16 @@ -# AMD OpenVX (AMDOVX) -AMD OpenVX (beta preview) is a highly optimized open source implementation of the [Khronos OpenVX](https://www.khronos.org/registry/vx/) computer vision specification. It allows for rapid prototyping as well as fast execution on a wide range of computer hardware, including small embedded x86 CPUs and large workstation discrete GPUs. +# AMD OpenVX (AMDOVX) +AMD OpenVX (beta) is a highly optimized open source implementation of the [Khronos OpenVX](https://www.khronos.org/registry/vx/) computer vision specification. It allows for rapid prototyping as well as fast execution on a wide range of computer hardware, including small embedded x86 CPUs and large workstation discrete GPUs. The amdovx-core project consists of two components: * [OpenVX](openvx/README.md): AMD OpenVX library * [RunVX](runvx/README.md): command-line utility to execute OpenVX graph described in GDF text file +* [RunCL](runcl/README.md): command-line utility to build, execute, and debug OpenCL programs The OpenVX framework provides a mechanism to add new vision functions to OpenVX by 3rd party vendors. 
Look into github [amdovx-modules](https://github.com/GPUOpen-ProfessionalCompute-Libraries/amdovx-modules) project for additional OpenVX modules and utilities. -* [vx_loomsl](https://github.com/GPUOpen-ProfessionalCompute-Libraries/amdovx-modules/tree/master/vx_loomsl/README.md): Radeon LOOM stitching library for live 360 degree video applications -* [loom_shell](https://github.com/GPUOpen-ProfessionalCompute-Libraries/amdovx-modules/tree/master/utils/loom_shell/README.md): an interpreter to prototype 360 degree video stitching applications using a script -* [vx_ext_cv](https://github.com/GPUOpen-ProfessionalCompute-Libraries/amdovx-modules/tree/master/vx_ext_cv/README.md): OpenVX module that implemented a mechanism to access OpenCV functionality as OpenVX kernels +* **vx_nn**: OpenVX neural network module that was built on top of [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) +* **vx_loomsl**: Radeon LOOM stitching library for live 360 degree video applications +* **loom_shell**: an interpreter to prototype 360 degree video stitching applications using a script +* **vx_opencv**: OpenVX module that implemented a mechanism to access OpenCV functionality as OpenVX kernels This software is provided under a MIT-style license, see the file COPYRIGHT.txt for details. @@ -21,26 +23,29 @@ This software is provided under a MIT-style license, see the file COPYRIGHT.txt ## Pre-requisites * CPU: SSE4.1 or above CPU, 64-bit. -* GPU: Radeon R7 Series or above (Kaveri+ APU), Radeon 3xx Series or above (optional) - * DRIVER: AMD Catalyst 15.7 or higher (version 15.20) with OpenCL 2.0 runtimes - * AMD APP SDK 3.0 [download](http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/). 
+* GPU: Radeon Professional Graphics Cards or Vega Family of Products (16GB required for vx_loomsl and vx_nn libraries) + * Windows: install the latest drivers and OpenCL SDK [download](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases) + * Linux: install [ROCm](https://rocm.github.io/ROCmInstall.html) +* OpenCV 3 (optional) [download](https://github.com/opencv/opencv/releases) for RunVX + * Set OpenCV_DIR environment variable to OpenCV/build folder ## Build Instructions Build this project to generate AMD OpenVX library and RunVX executable. * Refer to [openvx/include/VX](openvx/include/VX) for Khronos OpenVX standard header files. * Refer to [openvx/include/vx_ext_amd.h](openvx/include/vx_ext_amd.h) for vendor extensions in AMD OpenVX library. * Refer to [runvx/README.md](runvx/README.md) for RunVX details. +* Refer to [runcl/README.md](runcl/README.md) for RunCL details. ### Build using Visual Studio Professional 2013 on 64-bit Windows 10/8.1/7 -* Install OpenCV 3.0 [download](http://opencv.org/downloads.html) for RunVX tool to support camera capture and image display +* Install OpenCV 3 with contrib [download](https://github.com/opencv/opencv/releases) for RunVX tool to support camera capture and image display (optional) * OpenCV_DIR environment variable should point to OpenCV/build folder * Use amdovx-core/amdovx.sln to build for x64 platform -* If AMD GPU (or OpenCL 2.0) is not available, set build flag ENABLE_OPENCL=0 in openvx/openvx.vcxproj and runvx/runvx.vcxproj. +* If AMD GPU (or OpenCL) is not available, set build flag ENABLE_OPENCL=0 in openvx/openvx.vcxproj and runvx/runvx.vcxproj. ### Build using CMake * Install CMake 2.8 or newer [download](http://cmake.org/download/). 
-* Install OpenCV 3.0 [download](https://github.com/opencv/opencv/releases/tag/3.0.0) for RunVX tool to support camera capture and image display +* Install OpenCV 3 with contrib [download](https://github.com/opencv/opencv/releases) for RunVX tool to support camera capture and image display (optional) * OpenCV_DIR environment variable should point to OpenCV/build folder * Install libssl-dev on linux (optional) * Use CMake to configure and generate Makefile -* If AMD GPU (or OpenCL 2.0) is not available, use build flag -DCMAKE_DISABLE_FIND_PACKAGE_OpenCL=TRUE. +* If AMD GPU (or OpenCL) is not available, use build flag -DCMAKE_DISABLE_FIND_PACKAGE_OpenCL=TRUE. diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake new file mode 100644 index 0000000..5109b39 --- /dev/null +++ b/cmake/FindOpenCL.cmake @@ -0,0 +1,84 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ +find_path(OPENCL_INCLUDE_DIRS + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + /opt/cuda/include + /opt/rocm/opencl/include + DOC "OpenCL header file path" + ) +mark_as_advanced( OPENCL_INCLUDE_DIRS ) + +if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 x86_64/sdk + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + /opt/rocm/opencl/lib + ) +else( ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +endif( ) +mark_as_advanced( OPENCL_LIBRARIES ) + +include( FindPackageHandleStandardArgs ) +find_package_handle_standard_args( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) + +set(OpenCL_FOUND ${OPENCL_FOUND} CACHE INTERNAL "") +set(OpenCL_LIBRARIES ${OPENCL_LIBRARIES} CACHE INTERNAL "") +set(OpenCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS} CACHE INTERNAL "") + +if( NOT OPENCL_FOUND ) + message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +endif() diff --git a/openvx/CMakeLists.txt b/openvx/CMakeLists.txt index fbea63d..61dea04 100644 --- a/openvx/CMakeLists.txt +++ b/openvx/CMakeLists.txt @@ -77,8 +77,8 @@ if( POLICY CMP0054 ) cmake_policy( SET CMP0054 OLD ) endif() if 
("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT /DVX_API_ENTRY=__declspec(dllexport)") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd /DVX_API_ENTRY=__declspec(dllexport)") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD /DVX_API_ENTRY=__declspec(dllexport)") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd /DVX_API_ENTRY=__declspec(dllexport)") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -std=c++11") target_link_libraries(openvx dl m) diff --git a/openvx/ago/ago_drama_alloc.cpp b/openvx/ago/ago_drama_alloc.cpp index 3614b59..4232767 100644 --- a/openvx/ago/ago_drama_alloc.cpp +++ b/openvx/ago/ago_drama_alloc.cpp @@ -61,6 +61,202 @@ static int agoOptimizeDramaAllocRemoveUnusedData(AgoGraph * agraph) } #if ENABLE_OPENCL +int agoGpuOclAllocBuffers(AgoGraph * graph) +{ + // get default target + vx_uint32 bufferMergeFlags = 0; + char textBuffer[1024]; + if (agoGetEnvironmentVariable("AGO_BUFFER_MERGE_FLAGS", textBuffer, sizeof(textBuffer))) { + bufferMergeFlags = atoi(textBuffer); + } + + // mark hierarchical level (start,end) of all data in the graph + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + data->hierarchical_life_start = INT_MAX; + data->hierarchical_life_end = 0; + data->initialization_flags = 0; + for (vx_uint32 j = 0; j < data->numChildren; j++) { + data->children[j]->hierarchical_life_start = INT_MAX; + data->children[j]->hierarchical_life_end = 0; + data->children[j]->initialization_flags = 0; + } + } + } + } + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + for (AgoData * data : supernode->dataList) { + data->hierarchical_life_start = min(data->hierarchical_life_start, supernode->hierarchical_level_start); + data->hierarchical_life_end = max(data->hierarchical_life_end, 
supernode->hierarchical_level_end); + } + } + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (!node->supernode) { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data) { + data->hierarchical_life_start = min(data->hierarchical_life_start, node->hierarchical_level); + data->hierarchical_life_end = max(data->hierarchical_life_end, node->hierarchical_level); + for (vx_uint32 j = 0; j < data->numChildren; j++) { + data->children[j]->hierarchical_life_start = min(data->children[j]->hierarchical_life_start, node->hierarchical_level); + data->children[j]->hierarchical_life_end = max(data->children[j]->hierarchical_life_end, node->hierarchical_level); + } + } + } + } + } + + // get the list of virtual data (D) that need GPU buffers and mark if CPU access is not needed for virtual buffers + auto isDataValidForGd = [=](AgoData * data) -> bool { + return data && data->isVirtual; + }; + std::vector<AgoData *> D; + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + for (size_t i = 0; i < supernode->dataList.size(); i++) { + AgoData * data = supernode->dataList[i]; + if (supernode->dataInfo[i].needed_as_a_kernel_argument && isDataValidForGd(data) && (data->initialization_flags & 1) == 0) { + data->initialization_flags |= 1; + if (!(bufferMergeFlags & 2)) { + data->device_type_unused = AGO_TARGET_AFFINITY_CPU; + } + D.push_back(data); + } + } + } + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (!node->supernode) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU || + node->akernel->opencl_buffer_access_enable) + { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (isDataValidForGd(data) && (data->initialization_flags & 1) == 0) { + data->initialization_flags |= 1; + if (!(bufferMergeFlags & 2)) { + data->device_type_unused = AGO_TARGET_AFFINITY_CPU; + } + 
D.push_back(data); + } + } + } + } + } + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (!node->supernode) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU && + !node->akernel->opencl_buffer_access_enable) + { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && data->isVirtual) { + data->device_type_unused &= ~AGO_TARGET_AFFINITY_CPU; + for (vx_uint32 j = 0; j < data->numChildren; j++) { + data->children[j]->device_type_unused &= ~AGO_TARGET_AFFINITY_CPU; + } + } + } + } + } + } + + // get data groups (Gd) + auto getMemObjectType = [=](AgoData * data) -> cl_mem_object_type { + cl_mem_object_type obj_type = CL_MEM_OBJECT_BUFFER; + if (data->ref.type == VX_TYPE_LUT && data->u.lut.type == VX_TYPE_UINT8) + obj_type = CL_MEM_OBJECT_IMAGE1D; + return obj_type; + }; + auto getMemObjectSize = [=](AgoData * data) -> size_t { + return data->opencl_buffer_offset + data->size; + }; + auto isMergePossible = [=](std::vector<AgoData *>& G, AgoData * data) -> bool { + vx_uint32 s = data->hierarchical_life_start; + vx_uint32 e = data->hierarchical_life_end; + cl_mem_object_type dataMemType = getMemObjectType(data); + for (auto d : G) { + cl_mem_object_type dMemType = getMemObjectType(d); + if((dataMemType != dMemType) || + (s >= d->hierarchical_life_start && s <= d->hierarchical_life_end) || + (e >= d->hierarchical_life_start && e <= d->hierarchical_life_end)) + { + return false; + } + } + return true; + }; + auto calcMergedCost = [=](std::vector<AgoData *>& G, AgoData * data) -> size_t { + size_t size = getMemObjectSize(data); + for (auto d : G) { + size = max(size, getMemObjectSize(d)); + } + return size; + }; + std::vector< std::vector<AgoData *> > Gd; + std::vector< size_t > Gsize; + for (AgoData * data : D) { + size_t bestj = INT_MAX, bestCost = INT_MAX; + if (!(bufferMergeFlags & 1)) { + for (size_t j = 0; j < Gd.size(); j++) { + if(isMergePossible(Gd[j], data)) { + size_t cost = calcMergedCost(Gd[j], 
data); + if(cost < bestCost) { + bestj = j; + bestCost = cost; + } + } + } + } + if(bestj == INT_MAX) { + bestj = Gd.size(); + bestCost = getMemObjectSize(data); + Gd.push_back(std::vector<AgoData *>()); + } + Gd[bestj].push_back(data); + } + + // allocate one GPU buffer per group + for (size_t j = 0; j < Gd.size(); j++) { + size_t k = 0; + for (size_t i = 1; i < Gd[j].size(); i++) { + if(getMemObjectSize(Gd[j][i]) > getMemObjectSize(Gd[j][k])) + k = i; + } + if (agoGpuOclAllocBuffer(Gd[j][k]) < 0) { + return -1; + } + for (size_t i = 0; i < Gd[j].size(); i++) { + if(i != k) { + Gd[j][i]->opencl_buffer = Gd[j][k]->opencl_buffer; + Gd[j][i]->opencl_buffer_offset = Gd[j][k]->opencl_buffer_offset; + } + } + } + + // allocate GPU buffers if node scheduled on GPU using OpenCL or using opencl_buffer_access_enable + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU || + node->akernel->opencl_buffer_access_enable) + { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && !data->opencl_buffer && !data->isVirtual) { + if (agoIsPartOfDelay(data)) { + int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0; + data = agoGetSiblingTraceToDelayForUpdate(data, siblingTrace, siblingTraceCount); + if (!data) return -1; + } + if (agoGpuOclAllocBuffer(data) < 0) { + return -1; + } + } + } + } + } + return 0; +} + static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) { // check to make sure that GPU resources are needed @@ -85,9 +281,10 @@ static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) cl_int err = -1; graph->opencl_device = context->opencl_device_list[0]; #if defined(CL_VERSION_2_0) - graph->opencl_cmdq = clCreateCommandQueueWithProperties(context->opencl_context, graph->opencl_device, NULL, &err); + cl_queue_properties properties[] = { CL_QUEUE_PROPERTIES, context->opencl_cmdq_properties, 0 }; + graph->opencl_cmdq 
= clCreateCommandQueueWithProperties(context->opencl_context, graph->opencl_device, properties, &err); #else - graph->opencl_cmdq = clCreateCommandQueue(context->opencl_context, graph->opencl_device, 0, &err); + graph->opencl_cmdq = clCreateCommandQueue(context->opencl_context, graph->opencl_device, context->opencl_cmdq_properties, &err); #endif if (err) { agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clCreateCommandQueueWithProperties(%p,%p,0,*) => %d\n", context->opencl_context, graph->opencl_device, err); @@ -116,6 +313,11 @@ static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) if (status == VX_SUCCESS) { if (node->opencl_type & NODE_OPENCL_TYPE_FULL_KERNEL) { strcpy(node->opencl_name, NODE_OPENCL_KERNEL_NAME); + for(vx_size dim = node->opencl_work_dim; dim < 3; dim++) { + node->opencl_global_work[dim] = 1; + node->opencl_local_work[dim] = 1; + } + node->opencl_work_dim = 3; } else { agoAddLogEntry(&node->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaAllocGpuResources: doesn't support kernel %s as a standalone OpenCL kernel\n", node->akernel->name); @@ -151,6 +353,11 @@ static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) node->opencl_local_work, node->opencl_local_buffer_usage_mask, node->opencl_local_buffer_size_in_bytes); if (status == VX_SUCCESS) { node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + for(vx_size dim = node->opencl_work_dim; dim < 3; dim++) { + node->opencl_global_work[dim] = 1; + node->opencl_local_work[dim] = 1; + } + node->opencl_work_dim = 3; } else if (status != AGO_ERROR_KERNEL_NOT_IMPLEMENTED) { agoAddLogEntry(&node->akernel->ref, status, "ERROR: agoOptimizeDramaAllocGpuResources: kernel %s failed to generate OpenCL code (error %d)\n", node->akernel->name, status); @@ -176,10 +383,6 @@ static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) } // link supernode into node node->supernode = supernode; - // make sure that the GPU buffer resources are allocated in node - if (agoGpuOclAllocBuffers(graph, node) 
< 0) { - return -1; - } // initialize supernode with OpenCL information supernode->isGpuOclSuperNode = true; supernode->opencl_cmdq = graph->opencl_cmdq; @@ -190,32 +393,33 @@ static int agoOptimizeDramaAllocGpuResources(AgoGraph * graph) } } if (supernode) { - // finalize - if (agoGpuOclSuperNodeFinalize(graph, supernode) < 0) { - return -1; - } // add supernode to the master list supernode->next = graph->supernodeList; graph->supernodeList = supernode; } } - // allocate and finalize single nodes with GPU - for (AgoNode * node = graph->nodeList.head; node; node = node->next) { - if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && node->attr_affinity.group == 0) { - // make sure that the GPU buffer resources are allocated in node - if (agoGpuOclAllocBuffers(graph, node) < 0) { - return -1; - } - if (agoGpuOclSingleNodeFinalize(graph, node) < 0) { - return -1; - } + + // update supernodes for buffer usage and hierarchical levels + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + if (agoGpuOclSuperNodeUpdate(graph, supernode) < 0) { + return -1; + } + } + + // allocate GPU buffers if node scheduled on GPU using OpenCL or using opencl_buffer_access_enable + if (agoGpuOclAllocBuffers(graph) < 0) { + return -1; + } + + // finalize all GPU supernodes and single nodes + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + if (agoGpuOclSuperNodeFinalize(graph, supernode) < 0) { + return -1; } } - // allocate buffers for nodes with opencl_buffer_access_enable for (AgoNode * node = graph->nodeList.head; node; node = node->next) { - if (node->akernel->opencl_buffer_access_enable) { - // make sure that the GPU buffer resources are allocated in node - if (agoGpuOclAllocBuffers(graph, node) < 0) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU && node->attr_affinity.group == 0) { + if (agoGpuOclSingleNodeFinalize(graph, node) < 0) { return -1; } } 
diff --git a/openvx/ago/ago_drama_divide.cpp b/openvx/ago/ago_drama_divide.cpp index abc7722..8e42de6 100644 --- a/openvx/ago/ago_drama_divide.cpp +++ b/openvx/ago/ago_drama_divide.cpp @@ -26,7 +26,7 @@ THE SOFTWARE. #define SANITY_CHECK_DATA_TYPE(data,data_type) if(!data || data->ref.type != data_type) return -1 #define SANITY_CHECK_DATA_TYPE_OPTIONAL(data,data_type) if( data && data->ref.type != data_type) return -1 -int agoDramaDivideAppend(AgoNodeList * nodeList, AgoNode * anode, vx_enum new_kernel_id) +int agoDramaDivideAppend(AgoNodeList * nodeList, AgoNode * anode, vx_enum new_kernel_id, vx_reference * paramList, vx_uint32 paramCount) { if (new_kernel_id == VX_KERNEL_AMD_INVALID) { // TBD: error handling @@ -35,15 +35,26 @@ int agoDramaDivideAppend(AgoNodeList * nodeList, AgoNode * anode, vx_enum new_ke } // create a new AgoNode and add it to the nodeList AgoNode * childnode = agoCreateNode((AgoGraph *)anode->ref.scope, new_kernel_id); - for (vx_uint32 i = 0; i < anode->paramCount; i++) { - childnode->paramList[i] = anode->paramList[i]; + for (vx_uint32 i = 0; i < paramCount; i++) { + childnode->paramList[i] = (AgoData *)paramList[i]; } + anode->drama_divide_invoked = true; // transfer attributes from anode to childnode agoImportNodeConfig(childnode, anode); // verify the node return agoVerifyNode(childnode); } +vx_status VX_CALLBACK agoDramaDivideAddNodeCallback(vx_node node, vx_enum kernel_id, vx_reference * paramList, vx_uint32 paramCount) +{ + return agoDramaDivideAppend(&((AgoGraph *)node->ref.scope)->nodeList, node, kernel_id, paramList, paramCount); +} + +int agoDramaDivideAppend(AgoNodeList * nodeList, AgoNode * anode, vx_enum new_kernel_id) +{ + return agoDramaDivideAppend(nodeList, anode, new_kernel_id, (vx_reference *)anode->paramList, anode->paramCount); +} + int agoDramaDivideColorConvertNode(AgoNodeList * nodeList, AgoNode * anode) { // sanity checks @@ -1764,6 +1775,36 @@ int agoDramaDivideOpticalFlowPyrLkNode(AgoNodeList * nodeList, AgoNode * 
anode) #endif } +int agoDramaDivideCopyNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + SANITY_CHECK_DATA_TYPE(anode->paramList[1], anode->paramList[0]->ref.type); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[1]; + anode->paramList[1] = paramList[0]; + anode->paramCount = 2; + vx_enum new_kernel_id = VX_KERNEL_AMD_COPY_DATA_DATA; + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + +int agoDramaDivideSelectNode(AgoNodeList * nodeList, AgoNode * anode) +{ + // sanity checks + SANITY_CHECK_DATA_TYPE(anode->paramList[0], VX_TYPE_SCALAR); + SANITY_CHECK_DATA_TYPE(anode->paramList[3], anode->paramList[1]->ref.type); + SANITY_CHECK_DATA_TYPE(anode->paramList[2], anode->paramList[1]->ref.type); + // save parameters + AgoData * paramList[AGO_MAX_PARAMS]; memcpy(paramList, anode->paramList, sizeof(paramList)); + anode->paramList[0] = paramList[3]; + anode->paramList[1] = paramList[0]; + anode->paramList[2] = paramList[1]; + anode->paramList[3] = paramList[2]; + anode->paramCount = 4; + vx_enum new_kernel_id = VX_KERNEL_AMD_SELECT_DATA_DATA_DATA; + return agoDramaDivideAppend(nodeList, anode, new_kernel_id); +} + int agoDramaDivideNode(AgoNodeList * nodeList, AgoNode * anode) { // save parameter list @@ -1896,6 +1937,12 @@ int agoDramaDivideNode(AgoNodeList * nodeList, AgoNode * anode) case VX_KERNEL_HALFSCALE_GAUSSIAN: status = agoDramaDivideHalfscaleGaussianNode(nodeList, anode); break; + case VX_KERNEL_COPY: + status = agoDramaDivideCopyNode(nodeList, anode); + break; + case VX_KERNEL_SELECT: + status = agoDramaDivideSelectNode(nodeList, anode); + break; default: break; } @@ -1930,7 +1977,7 @@ int agoOptimizeDramaDivide(AgoGraph * agraph) } else { // TBD: error handling - agoAddLogEntry(&anode->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaDivide: failed for node %s\n", anode->akernel->name); + 
agoAddLogEntry(&anode->akernel->ref, VX_FAILURE, "ERROR: agoOptimizeDramaDivide: failed for node %s (not implemented yet)\n", anode->akernel->name); astatus = -1; // advance to next node, since node divide failed aprev = anode; @@ -1939,10 +1986,11 @@ int agoOptimizeDramaDivide(AgoGraph * agraph) } else if (anode->akernel->regen_callback_f) { // try regenerating the node - vx_bool regen_not_needed = vx_true_e; - vx_status status = anode->akernel->regen_callback_f(agraph, anode, regen_not_needed); + anode->drama_divide_invoked = false; + vx_bool replace_original = vx_true_e; + vx_status status = anode->akernel->regen_callback_f(anode, agoDramaDivideAddNodeCallback, replace_original); if (status == VX_SUCCESS) { - if (regen_not_needed == vx_false_e) { + if (anode->drama_divide_invoked && replace_original) { // remove and release the current node if (aprev) aprev->next = anode->next; else agraph->nodeList.head = anode->next; diff --git a/openvx/ago/ago_drama_remove.cpp b/openvx/ago/ago_drama_remove.cpp index b0cfc95..d21dbd3 100644 --- a/openvx/ago/ago_drama_remove.cpp +++ b/openvx/ago/ago_drama_remove.cpp @@ -911,12 +911,16 @@ int agoOptimizeDramaRemoveCopyNodes(AgoGraph * agraph) for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) { AgoKernel * akernel = anode->akernel; bool nodeCanBeRemoved = false; - if (anode->akernel->id == VX_KERNEL_AMD_CHANNEL_COPY_U8_U8) + if (anode->akernel->id == VX_KERNEL_AMD_CHANNEL_COPY_U8_U8 || anode->akernel->id == VX_KERNEL_AMD_COPY_DATA_DATA) { // copy of a virtual data can be removed by just replacing the virtual data // TBD: need to handle possible optimizations with buffers in delay object AgoData * dstParam = anode->paramList[0]; AgoData * srcParam = anode->paramList[1]; + if (anode->akernel->id == VX_KERNEL_AMD_COPY_DATA_DATA) { + srcParam = anode->paramList[0]; + dstParam = anode->paramList[1]; + } bool replaceSrc = false; bool replaceDst = false; if (dstParam->isVirtual && !agoIsPartOfDelay(dstParam)) { 
diff --git a/openvx/ago/ago_haf_gpu_corners.cpp b/openvx/ago/ago_haf_gpu_corners.cpp index 984546d..37b1393 100644 --- a/openvx/ago/ago_haf_gpu_corners.cpp +++ b/openvx/ago/ago_haf_gpu_corners.cpp @@ -251,8 +251,8 @@ int HafGpu_FastCorners_XY_U8(AgoNode * node) // FAST without non-max supression // OpenCL work items - node->opencl_global_work[0] = inputImg->u.img.width - 6; - node->opencl_global_work[1] = inputImg->u.img.height - 6; + node->opencl_global_work[0] = (inputImg->u.img.width - 6 + work_group_width - 1) & ~(work_group_width - 1); + node->opencl_global_work[1] = (inputImg->u.img.height - 6 + work_group_height - 1) & ~(work_group_height - 1); // Pragma and data structure declarations sprintf(item, diff --git a/openvx/ago/ago_haf_gpu_special_filters.cpp b/openvx/ago/ago_haf_gpu_special_filters.cpp index eff0861..a934d0c 100644 --- a/openvx/ago/ago_haf_gpu_special_filters.cpp +++ b/openvx/ago/ago_haf_gpu_special_filters.cpp @@ -881,9 +881,9 @@ int HafGpu_CannySobelFilters(AgoNode * node) " %s_GXY(&gx, &gy, x, y, lbuf, p, stride); // LinearFilter_ANYx2_U8\n" " uint mask = select(0xffffu, 0u, y < %d); mask = select(0u, mask, y < %d);\n" // (N >> 1), height - (N >> 1) " U16x8 r; uint mp;\n" - " mp = CannyMagPhase(gx.s0, gy.s0) & mask; mp = select(mp, 0u, x < %du); r.s0 = mp;\n" // (N>>1)-0 - " mp = CannyMagPhase(gx.s1, gy.s1) & mask; mp = select(mp, 0u, x < %du); r.s0 |= (mp << 16);\n" // (N>>1)-1 - " mp = CannyMagPhase(gx.s2, gy.s2) & mask; mp = select(mp, 0u, x < %du); r.s1 = mp;\n" // (N > 5) ? (N>>1)-2 : 0 + " mp = CannyMagPhase(gx.s0, gy.s0) & mask; mp = select(mp, 0u, (int)x < %d); r.s0 = mp;\n" // (N>>1)-0 + " mp = CannyMagPhase(gx.s1, gy.s1) & mask; mp = select(mp, 0u, (int)x < %d); r.s0 |= (mp << 16);\n" // (N>>1)-1 + " mp = CannyMagPhase(gx.s2, gy.s2) & mask; mp = select(mp, 0u, (int)x < %d); r.s1 = mp;\n" // (N > 5) ? 
(N>>1)-2 : 0 " mp = CannyMagPhase(gx.s3, gy.s3) & mask; r.s1 |= (mp << 16);\n" // " mp = CannyMagPhase(gx.s4, gy.s4) & mask; r.s2 = mp;\n" // " mp = CannyMagPhase(gx.s5, gy.s5) & mask; mp = select(0u, mp, x < %du); r.s2 |= (mp << 16);\n" // width-(N>>1)-5 diff --git a/openvx/ago/ago_interface.cpp b/openvx/ago/ago_interface.cpp index d906f4f..c6f39cc 100644 --- a/openvx/ago/ago_interface.cpp +++ b/openvx/ago/ago_interface.cpp @@ -1210,6 +1210,7 @@ vx_status agoVerifyNode(AgoNode * node) // reset meta data of the node for output argument processing if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) { vx_meta_format meta = &node->metaList[arg]; + meta->data.ref.type = data->ref.type; meta->data.ref.external_count = 1; meta->set_valid_rectangle_callback = nullptr; if (data->ref.type == VX_TYPE_IMAGE) { @@ -1300,7 +1301,7 @@ vx_status agoVerifyNode(AgoNode * node) if (data) { if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) { vx_meta_format meta = &node->metaList[arg]; - if (kernel->argType[arg] && (meta->data.ref.type != kernel->argType[arg])) { + if (kernel->argType[arg] && kernel->argType[arg] != VX_TYPE_REFERENCE && (meta->data.ref.type != kernel->argType[arg])) { agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: output argument type mismatch for argument#%d\n", kernel->name, arg); return VX_ERROR_INVALID_TYPE; } @@ -1576,6 +1577,22 @@ int agoVerifyGraph(AgoGraph * graph) // - single writers // - no loops status = agoOptimizeDramaComputeGraphHierarchy(graph); + if (status) { + return status; + } + +#if ENABLE_OPENCL + // if all nodes run on GPU, clear enable_node_level_opencl_flush + bool cpuTargetBufferNodesExists = false; + for (AgoNode * node = graph->nodeList.head; node; node = node->next) { + if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU && + 
!node->akernel->opencl_buffer_access_enable) + cpuTargetBufferNodesExists = true; + } + if(!cpuTargetBufferNodesExists) { + graph->enable_node_level_opencl_flush = false; + } +#endif return status; } @@ -1878,7 +1895,7 @@ static int agoDataSyncFromGpuToCpu(AgoGraph * graph, AgoNode * node, AgoData * d if (size > 0) { cl_int err = clEnqueueReadBuffer(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL); if (err) { - agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadBuffer() => %d\n", err); + agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadBuffer(0x%x,%s,%ld,%ld) => %d\n", dataToSync->ref.type, dataToSync->name.c_str(), dataToSync->opencl_buffer_offset, size, err); return -1; } } @@ -1992,6 +2009,7 @@ int agoExecuteGraph(AgoGraph * graph) memset(&graph->opencl_perf, 0, sizeof(graph->opencl_perf)); #endif // execute one nodes in one hierarchical level at a time + bool opencl_buffer_access_enable = false; for (auto enode = graph->nodeList.head; enode;) { // get snode..enode with next hierarchical_level auto hierarchical_level = enode->hierarchical_level; @@ -2035,31 +2053,63 @@ int agoExecuteGraph(AgoGraph * graph) for (auto node = snode; node != enode; node = node->next) { if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU) { #if ENABLE_OPENCL - agoPerfProfileEntry(graph, ago_profile_type_wait_begin, &node->ref); - if (nodeLaunchHierarchicalLevel > 0 && nodeLaunchHierarchicalLevel < node->hierarchical_level) { - status = agoWaitForNodesCompletion(graph); - if (status != VX_SUCCESS) { - agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoWaitForNodesCompletion failed (%d:%s)\n", status, agoEnum2Name(status)); - return status; + opencl_buffer_access_enable |= (node->akernel->opencl_buffer_access_enable ? 
true : false); + if (!node->akernel->opencl_buffer_access_enable) { + agoPerfProfileEntry(graph, ago_profile_type_wait_begin, &node->ref); + if (nodeLaunchHierarchicalLevel > 0 && nodeLaunchHierarchicalLevel < node->hierarchical_level) { + status = agoWaitForNodesCompletion(graph); + if (status != VX_SUCCESS) { + agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoWaitForNodesCompletion failed (%d:%s)\n", status, agoEnum2Name(status)); + return status; + } + nodeLaunchHierarchicalLevel = 0; + } + if(opencl_buffer_access_enable) { + cl_int err = clFinish(graph->opencl_cmdq); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clFinish(graph) => %d\n", err); + return VX_FAILURE; + } + opencl_buffer_access_enable = false; } - nodeLaunchHierarchicalLevel = 0; + agoPerfProfileEntry(graph, ago_profile_type_wait_end, &node->ref); } - agoPerfProfileEntry(graph, ago_profile_type_wait_end, &node->ref); agoPerfProfileEntry(graph, ago_profile_type_copy_begin, &node->ref); // make sure that all input buffers are synched - for (vx_uint32 i = 0; i < node->paramCount; i++) { - AgoData * data = node->paramList[i]; - if (data && (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) { - auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data; - status = agoDataSyncFromGpuToCpu(graph, node, dataToSync); - for (vx_uint32 j = 0; !status && j < dataToSync->numChildren; j++) { - AgoData * jdata = dataToSync->children[j]; - if (jdata) - status = agoDataSyncFromGpuToCpu(graph, node, jdata); + if (node->akernel->opencl_buffer_access_enable) { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && data->opencl_buffer && + (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) + { + auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? 
data->u.img.roiMasterImage : data; + if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT) && + dataToSync->opencl_buffer && !(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) + { + status = agoDirective((vx_reference)dataToSync, VX_DIRECTIVE_AMD_COPY_TO_OPENCL); + if(status != VX_SUCCESS) { + agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoDirective(*,VX_DIRECTIVE_AMD_COPY_TO_OPENCL) failed (%d:%s)\n", status, agoEnum2Name(status)); + return status; + } + } } - if (status) { - agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoDataSyncFromGpuToCpu failed (%d:%s) for node(%s) arg#%d data(%s)\n", status, agoEnum2Name(status), node->akernel->name, i, data->name.c_str()); - return status; + } + } + else { + for (vx_uint32 i = 0; i < node->paramCount; i++) { + AgoData * data = node->paramList[i]; + if (data && (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) { + auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? 
data->u.img.roiMasterImage : data; + status = agoDataSyncFromGpuToCpu(graph, node, dataToSync); + for (vx_uint32 j = 0; !status && j < dataToSync->numChildren; j++) { + AgoData * jdata = dataToSync->children[j]; + if (jdata) + status = agoDataSyncFromGpuToCpu(graph, node, jdata); + } + if (status) { + agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoDataSyncFromGpuToCpu failed (%d:%s) for node(%s) arg#%d data(%s)\n", status, agoEnum2Name(status), node->akernel->name, i, data->name.c_str()); + return status; + } } } } @@ -2111,6 +2161,7 @@ int agoExecuteGraph(AgoGraph * graph) } } #if ENABLE_OPENCL + agoPerfProfileEntry(graph, ago_profile_type_wait_begin, &graph->ref); if (nodeLaunchHierarchicalLevel > 0) { status = agoWaitForNodesCompletion(graph); if (status != VX_SUCCESS) { @@ -2118,6 +2169,14 @@ int agoExecuteGraph(AgoGraph * graph) return status; } } + if(opencl_buffer_access_enable) { + cl_int err = clFinish(graph->opencl_cmdq); + if (err) { + agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clFinish(graph) => %d\n", err); + return VX_FAILURE; + } + } + agoPerfProfileEntry(graph, ago_profile_type_wait_end, &graph->ref); graph->opencl_perf_total.kernel_enqueue += graph->opencl_perf.kernel_enqueue; graph->opencl_perf_total.kernel_wait += graph->opencl_perf.kernel_wait; graph->opencl_perf_total.buffer_read += graph->opencl_perf.buffer_read; @@ -2251,6 +2310,14 @@ vx_status agoDirective(vx_reference reference, vx_enum directive) status = VX_ERROR_NOT_SUPPORTED; } break; + case VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH: + if (reference->type == VX_TYPE_GRAPH) { + ((AgoGraph *)reference)->enable_node_level_opencl_flush = false; + } + else { + status = VX_ERROR_NOT_SUPPORTED; + } + break; default: status = VX_ERROR_NOT_SUPPORTED; break; @@ -2302,7 +2369,11 @@ vx_status agoGraphDumpPerformanceProfile(AgoGraph * graph, const char * fileName if (entry.ref->type == VX_TYPE_GRAPH) strcpy(name, "GRAPH"); else if (entry.ref->type == VX_TYPE_NODE) strncpy(name, ((AgoNode 
*)entry.ref)->akernel->name, sizeof(name) - 1); else agoGetDataName(name, (AgoData *)entry.ref); - fprintf(fp, "%6d,%4d,%13.3f,%s\n", entry.id, entry.type, (float)(entry.time - stime) * factor, name); + static const char * type_str[] = { + "launch(s)", "launch(e)", "wait(s)", "wait(e)", "copy(s)", "copy(e)", "exec(s)", "exec(e)", + "8", "9", "10", "11", "12", "13", "14", "15" + }; + fprintf(fp, "%6d,%-9.9s,%13.3f,%s\n", entry.id, type_str[entry.type], (float)(entry.time - stime) * factor, name); } // clear the profiling data graph->performance_profile.clear(); diff --git a/openvx/ago/ago_internal.h b/openvx/ago/ago_internal.h index cff59c7..0a23cd7 100644 --- a/openvx/ago/ago_internal.h +++ b/openvx/ago/ago_internal.h @@ -34,7 +34,7 @@ THE SOFTWARE. // // version -#define AGO_VERSION "0.9.6" +#define AGO_VERSION "0.9.7" // debug configuration #define ENABLE_DEBUG_MESSAGES 0 // 0:disable 1:enable @@ -265,7 +265,8 @@ struct AgoConfigImage { vx_uint32 height; vx_df_image format; vx_uint32 stride_in_bytes; - vx_size pixel_size_in_bits; + vx_uint32 pixel_size_in_bits_num; + vx_uint32 pixel_size_in_bits_denom; vx_size components; vx_size planes; vx_bool isVirtual; @@ -417,10 +418,12 @@ struct AgoData { std::list mapped; vx_map_id nextMapId; vx_uint32 hierarchical_level; - vx_uint32 hierarchical_life_start; - vx_uint32 hierarchical_life_end; struct AgoNode * ownerOfUserBufferOpenCL; std::list roiDepList; + vx_uint32 hierarchical_life_start; + vx_uint32 hierarchical_life_end; + vx_uint32 initialization_flags; + vx_uint32 device_type_unused; public: AgoData(); ~AgoData(); @@ -507,8 +510,11 @@ struct AgoSuperNode { cl_program opencl_program; cl_kernel opencl_kernel; cl_event opencl_event; - size_t opencl_global_work[2]; + size_t opencl_global_work[3]; + size_t opencl_local_work[3]; #endif + vx_uint32 hierarchical_level_start; + vx_uint32 hierarchical_level_end; vx_status status; vx_perf_t perf; public: @@ -535,6 +541,7 @@ struct AgoNode { vx_nodecomplete_f callback; 
AgoSuperNode * supernode; bool initialized; + bool drama_divide_invoked; vx_uint32 valid_rect_num_inputs; vx_uint32 valid_rect_num_outputs; vx_rectangle_t ** valid_rect_inputs; @@ -615,6 +622,7 @@ struct AgoGraph { AgoSuperNode * supernodeList; cl_command_queue opencl_cmdq; cl_device_id opencl_device; + bool enable_node_level_opencl_flush; #endif AgoTargetAffinityInfo_ attr_affinity; vx_uint32 execFrameCount; @@ -682,6 +690,7 @@ struct AgoContext { #if defined(CL_VERSION_2_0) cl_device_svm_capabilities opencl_svmcaps; #endif + cl_command_queue_properties opencl_cmdq_properties; cl_uint opencl_num_devices; cl_device_id opencl_device_list[16]; char opencl_build_options[256]; @@ -757,8 +766,8 @@ AgoData * agoCreateDataFromDescription(AgoContext * acontext, AgoGraph * agraph, void agoGenerateDataName(AgoContext * acontext, const char * postfix, std::string& name); void agoGenerateVirtualDataName(AgoGraph * agraph, const char * postfix, std::string& name); int agoInitializeImageComponentsAndPlanes(AgoContext * acontext); -int agoSetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size components, vx_size planes, vx_size pixelSizeInBits, vx_color_space_e colorSpace, vx_channel_range_e channelRange); -int agoGetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_size * pPixelSizeInBits, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange); +int agoSetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size components, vx_size planes, vx_uint32 pixelSizeInBitsNum, vx_uint32 pixelSizeInBitsDenom, vx_color_space_e colorSpace, vx_channel_range_e channelRange); +int agoGetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_uint32 * pPixelSizeInBitsNum, vx_uint32 * pPixelSizeInBitsDenom, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange); int agoGetImagePlaneFormat(AgoContext * 
acontext, vx_df_image format, vx_uint32 width, vx_uint32 height, vx_uint32 plane, vx_df_image *pFormat, vx_uint32 * pWidth, vx_uint32 * pHeight); void agoGetDataName(vx_char * name, AgoData * data); int agoAllocData(AgoData * data); @@ -815,8 +824,9 @@ int agoGpuOclReleaseSuperNode(AgoSuperNode * supernode); int agoGpuOclReleaseData(AgoData * data); int agoGpuOclCreateContext(AgoContext * context, cl_context opencl_context); int agoGpuOclAllocBuffer(AgoData * data); -int agoGpuOclAllocBuffers(AgoGraph * graph, AgoNode * node); +int agoGpuOclAllocBuffers(AgoGraph * graph); int agoGpuOclSuperNodeMerge(AgoGraph * graph, AgoSuperNode * supernode, AgoNode * node); +int agoGpuOclSuperNodeUpdate(AgoGraph * graph, AgoSuperNode * supernode); int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode); int agoGpuOclSuperNodeLaunch(AgoGraph * graph, AgoSuperNode * supernode); int agoGpuOclSuperNodeWait(AgoGraph * graph, AgoSuperNode * supernode); @@ -877,4 +887,14 @@ inline int leftmostbit(unsigned int n) { return pos; } +inline vx_uint32 ImageWidthInBytesFloor(vx_uint32 width, const AgoData * img) +{ + return ((width * img->u.img.pixel_size_in_bits_num + img->u.img.pixel_size_in_bits_denom - 1) / img->u.img.pixel_size_in_bits_denom) >> 3; +} + +inline vx_uint32 ImageWidthInBytesCeil(vx_uint32 width, const AgoData * img) +{ + return ((width * img->u.img.pixel_size_in_bits_num + img->u.img.pixel_size_in_bits_denom - 1) / img->u.img.pixel_size_in_bits_denom + 7) >> 3; +} + #endif // __ago_internal_h__ diff --git a/openvx/ago/ago_kernel_api.cpp b/openvx/ago/ago_kernel_api.cpp index 1e4653e..63f39eb 100644 --- a/openvx/ago/ago_kernel_api.cpp +++ b/openvx/ago/ago_kernel_api.cpp @@ -2081,6 +2081,76 @@ int ovxKernel_HalfScaleGaussian(AgoNode * node, AgoKernelCommand cmd) return status; } +int ovxKernel_Copy(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_COPY_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == 
ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if (node->paramList[0]->ref.type != node->paramList[1]->ref.type) + return VX_ERROR_INVALID_PARAMETERS; + // set meta must be same as input + vx_meta_format meta; + meta = &node->metaList[1]; + meta->data.ref.type = node->paramList[0]->ref.type; + memcpy(&meta->data.u, &node->paramList[0]->u, sizeof(meta->data.u)); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + +int ovxKernel_Select(AgoNode * node, AgoKernelCommand cmd) +{ + // INFO: use VX_KERNEL_AMD_SELECT_* kernels + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + } + else if (cmd == ago_kernel_cmd_validate) { + // validate parameters + if ((node->paramList[1]->ref.type != node->paramList[2]->ref.type) || (node->paramList[1]->ref.type != node->paramList[3]->ref.type)) + return VX_ERROR_INVALID_PARAMETERS; + if (memcmp(&node->paramList[1]->u, &node->paramList[2]->u, sizeof(node->paramList[1]->u)) != 0) + return VX_ERROR_INVALID_PARAMETERS; + if (node->paramList[0]->u.scalar.type != VX_TYPE_BOOL) + return VX_ERROR_INVALID_TYPE; + // set meta must be same as input + vx_meta_format meta; + meta = &node->metaList[3]; + meta->data.ref.type = node->paramList[1]->ref.type; + memcpy(&meta->data.u, &node->paramList[1]->u, sizeof(meta->data.u)); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 
AGO_KERNEL_FLAG_SUBGRAPH + | AGO_KERNEL_FLAG_DEVICE_CPU +#if ENABLE_OPENCL + | AGO_KERNEL_FLAG_DEVICE_GPU +#endif + ; + status = VX_SUCCESS; + } + return status; +} + #if ENABLE_OPENCL ////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Local OpenCL Codegen Functions @@ -2917,15 +2987,15 @@ int agoKernel_Lut_U8_U8(AgoNode * node, AgoKernelCommand cmd) "{\n" " U8x8 r;\n" " float4 f;\n" - " f.s0 = read_imagef(lut, amd_unpack0(p1.s0)).s0 * 255.0f;\n" - " f.s1 = read_imagef(lut, amd_unpack1(p1.s0)).s0 * 255.0f;\n" - " f.s2 = read_imagef(lut, amd_unpack2(p1.s0)).s0 * 255.0f;\n" - " f.s3 = read_imagef(lut, amd_unpack3(p1.s0)).s0 * 255.0f;\n" + " f.s0 = read_imagef(lut, (int)( p1.s0 & 255)).s0 * 255.0f;\n" + " f.s1 = read_imagef(lut, (int)((p1.s0 >> 8) & 255)).s0 * 255.0f;\n" + " f.s2 = read_imagef(lut, (int)((p1.s0 >> 16) & 255)).s0 * 255.0f;\n" + " f.s3 = read_imagef(lut, (int)( p1.s0 >> 24 )).s0 * 255.0f;\n" " r.s0 = amd_pack(f);\n" - " f.s0 = read_imagef(lut, amd_unpack0(p1.s1)).s0 * 255.0f;\n" - " f.s1 = read_imagef(lut, amd_unpack1(p1.s1)).s0 * 255.0f;\n" - " f.s2 = read_imagef(lut, amd_unpack2(p1.s1)).s0 * 255.0f;\n" - " f.s3 = read_imagef(lut, amd_unpack3(p1.s1)).s0 * 255.0f;\n" + " f.s0 = read_imagef(lut, (int)( p1.s1 & 255)).s0 * 255.0f;\n" + " f.s1 = read_imagef(lut, (int)((p1.s1 >> 8) & 255)).s0 * 255.0f;\n" + " f.s2 = read_imagef(lut, (int)((p1.s1 >> 16) & 255)).s0 * 255.0f;\n" + " f.s3 = read_imagef(lut, (int)( p1.s1 >> 24 )).s0 * 255.0f;\n" " r.s1 = amd_pack(f);\n" " *p0 = r;\n" "}\n" @@ -18825,3 +18895,102 @@ int agoKernel_MinMaxLocMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) } return status; } + +int agoKernel_Copy_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + status = VX_ERROR_NOT_SUPPORTED; + } + else if (cmd == ago_kernel_cmd_validate) { + // validate 
parameters + if (node->paramList[0]->ref.type != node->paramList[1]->ref.type) + return VX_ERROR_INVALID_PARAMETERS; + // doesn't support host access buffers + if (node->paramList[0]->import_type != VX_MEMORY_TYPE_NONE || node->paramList[1]->import_type != VX_MEMORY_TYPE_NONE) + return VX_ERROR_NOT_SUPPORTED; + // doesn't support ROIs + if ((node->paramList[0]->ref.type == VX_TYPE_IMAGE && node->paramList[0]->u.img.roiMasterImage) || + (node->paramList[1]->ref.type == VX_TYPE_IMAGE && node->paramList[1]->u.img.roiMasterImage) || + (node->paramList[0]->ref.type == VX_TYPE_TENSOR && node->paramList[0]->u.tensor.roiMaster) || + (node->paramList[1]->ref.type == VX_TYPE_TENSOR && node->paramList[1]->u.tensor.roiMaster)) + return VX_ERROR_NOT_SUPPORTED; + // set meta must be same as input + vx_meta_format meta; + meta = &node->metaList[0]; + meta->data.ref.type = node->paramList[1]->ref.type; + memcpy(&meta->data.u, &node->paramList[1]->u, sizeof(meta->data.u)); + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_valid_rect_callback) { + // TBD: not implemented yet + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + size_t work_group_size = 256; + size_t num_work_items = node->paramList[0]->size / 4; + char code[1024]; + sprintf(code, + "__kernel __attribute__((reqd_work_group_size(%ld, 1, 1)))\n" + "void %s(__global char * dst_buf, uint dst_offset, uint4 dst_stride, __global char * src_buf, uint src_offset, uint4 src_stride)\n" + "{\n" + " uint id = get_global_id(0);\n" + " if(id < %ld) ((__global float *)(dst_buf + dst_offset))[id] = ((__global float *)(src_buf + src_offset))[id];\n" + "}\n", work_group_size, NODE_OPENCL_KERNEL_NAME, num_work_items); + node->opencl_code = code; + // use completely separate kernel + node->opencl_type = NODE_OPENCL_TYPE_FULL_KERNEL; + node->opencl_work_dim = 3; + node->opencl_global_work[0] = 
(num_work_items + work_group_size - 1) & ~(work_group_size - 1); + node->opencl_global_work[1] = 1; + node->opencl_global_work[2] = 1; + node->opencl_local_work[0] = work_group_size; + node->opencl_local_work[1] = 1; + node->opencl_local_work[2] = 1; + status = VX_SUCCESS; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0; +#if ENABLE_OPENCL + if (node->paramList[0]->ref.type == VX_TYPE_TENSOR) + node->target_support_flags |= AGO_KERNEL_FLAG_DEVICE_GPU | AGO_KERNEL_FLAG_GPU_INTEG_FULL; +#endif + status = VX_SUCCESS; + } + return status; +} + +int agoKernel_Select_DATA_DATA_DATA(AgoNode * node, AgoKernelCommand cmd) +{ + vx_status status = AGO_ERROR_KERNEL_NOT_IMPLEMENTED; + if (cmd == ago_kernel_cmd_execute) { + // TBD: not implemented yet + status = VX_ERROR_NOT_SUPPORTED; + } + else if (cmd == ago_kernel_cmd_validate) { + // TBD: not implemented yet + status = VX_ERROR_NOT_SUPPORTED; + } + else if (cmd == ago_kernel_cmd_initialize || cmd == ago_kernel_cmd_shutdown) { + status = VX_SUCCESS; + } + else if (cmd == ago_kernel_cmd_valid_rect_callback) { + // TBD: not implemented yet + } +#if ENABLE_OPENCL + else if (cmd == ago_kernel_cmd_opencl_codegen) { + // TBD: not implemented yet + status = VX_ERROR_NOT_SUPPORTED; + } +#endif + else if (cmd == ago_kernel_cmd_query_target_support) { + node->target_support_flags = 0; + status = VX_SUCCESS; + } + return status; +} diff --git a/openvx/ago/ago_kernel_api.h b/openvx/ago/ago_kernel_api.h index f311a90..e56f7d9 100644 --- a/openvx/ago/ago_kernel_api.h +++ b/openvx/ago/ago_kernel_api.h @@ -29,7 +29,7 @@ THE SOFTWARE. 
// import all kernels into framework int agoPublishKernels(AgoContext * acontext); -// OpenVX 1.0 built-in kernels +// OpenVX 1.x built-in kernels int ovxKernel_Invalid(AgoNode * node, AgoKernelCommand cmd); int ovxKernel_ColorConvert(AgoNode * node, AgoKernelCommand cmd); int ovxKernel_ChannelExtract(AgoNode * node, AgoKernelCommand cmd); @@ -72,6 +72,8 @@ int ovxKernel_FastCorners(AgoNode * node, AgoKernelCommand cmd); int ovxKernel_OpticalFlowPyrLK(AgoNode * node, AgoKernelCommand cmd); int ovxKernel_Remap(AgoNode * node, AgoKernelCommand cmd); int ovxKernel_HalfScaleGaussian(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Copy(AgoNode * node, AgoKernelCommand cmd); +int ovxKernel_Select(AgoNode * node, AgoKernelCommand cmd); // AMD low-level kernels int agoKernel_Set00_U8(AgoNode * node, AgoKernelCommand cmd); @@ -350,5 +352,7 @@ int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_Max(AgoNode * node, AgoKernel int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); int agoKernel_MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax(AgoNode * node, AgoKernelCommand cmd); int agoKernel_MinMaxLocMerge_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Copy_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); +int agoKernel_Select_DATA_DATA_DATA(AgoNode * node, AgoKernelCommand cmd); #endif // __ago_kernels_api_h__ diff --git a/openvx/ago/ago_kernel_list.cpp b/openvx/ago/ago_kernel_list.cpp index beac236..297096d 100644 --- a/openvx/ago/ago_kernel_list.cpp +++ b/openvx/ago/ago_kernel_list.cpp @@ -140,6 +140,9 @@ THE SOFTWARE. 
#define ATYPE_ASSIm { VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } #define ATYPE_AASSIm { VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_SCALAR, VX_TYPE_SCALAR, VX_TYPE_IMAGE, AGO_TYPE_MINMAXLOC_DATA } #define ATYPE_SAAAAAAAAA { VX_TYPE_SCALAR, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY, VX_TYPE_ARRAY } +#define ATYPE_RR { VX_TYPE_REFERENCE, VX_TYPE_REFERENCE } +#define ATYPE_SRRR { VX_TYPE_SCALAR, VX_TYPE_REFERENCE, VX_TYPE_REFERENCE, VX_TYPE_REFERENCE } +#define ATYPE_RSRR { VX_TYPE_REFERENCE, VX_TYPE_SCALAR, VX_TYPE_REFERENCE, VX_TYPE_REFERENCE } // for kernOpType & kernOpInfo #define KOP_UNKNOWN AGO_KERNEL_OP_TYPE_UNKNOWN, 0, @@ -169,7 +172,7 @@ static struct { AGO_KERNEL_FLAG_GROUP_AMDLL | (cpu_avail ? AGO_KERNEL_FLAG_DEVICE_CPU : 0) | (gpu_avail ? AGO_KERNEL_FLAG_DEVICE_GPU : 0) | \ (validRectReset ? AGO_KERNEL_FLAG_VALID_RECT_RESET : 0), argCfg, argType, kernOp \ } - // OpenVX 1.0 built-in kernels + // OpenVX 1.x built-in kernels OVX_KERNEL_ENTRY( VX_KERNEL_COLOR_CONVERT , ColorConvert, "color_convert", AIN_AOUT, ATYPE_II , false ), OVX_KERNEL_ENTRY( VX_KERNEL_CHANNEL_EXTRACT , ChannelExtract, "channel_extract", AINx2_AOUT, ATYPE_ISI , false ), OVX_KERNEL_ENTRY( VX_KERNEL_CHANNEL_COMBINE , ChannelCombine, "channel_combine", AINx2_AOPTINx2_AOUT, ATYPE_IIIII , false ), @@ -211,6 +214,8 @@ static struct { OVX_KERNEL_ENTRY( VX_KERNEL_OPTICAL_FLOW_PYR_LK , OpticalFlowPyrLK, "optical_flow_pyr_lk", AINx4_AOUT_AINx5, ATYPE_PPAAASSSSS , false ), OVX_KERNEL_ENTRY( VX_KERNEL_REMAP , Remap, "remap", AINx3_AOUT, ATYPE_IRSI , true ), OVX_KERNEL_ENTRY( VX_KERNEL_HALFSCALE_GAUSSIAN , HalfScaleGaussian, "halfscale_gaussian", AIN_AOUT_AIN, ATYPE_IIS , false ), + OVX_KERNEL_ENTRY( VX_KERNEL_COPY , Copy, "copy", AIN_AOUT, ATYPE_RR , false ), + OVX_KERNEL_ENTRY( VX_KERNEL_SELECT , Select, "select", AINx3_AOUT, ATYPE_SRRR , false ), // AMD low-level kernel primitives 
AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SET_00_U8 , 1, 1, Set00_U8, { AOUT }, ATYPE_I , KOP_ELEMWISE , false ), AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SET_FF_U8 , 1, 1, SetFF_U8, { AOUT }, ATYPE_I , KOP_ELEMWISE , false ), @@ -488,6 +493,8 @@ static struct { AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_Max_Count_MinMax, AOUT_AOPTOUTx2_AINx2, ATYPE_ASSIm , KOP_UNKNOWN , false ), AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MINMAX_COUNT_MINMAX , 1, 0, MinMaxLoc_DATA_S16DATA_Loc_MinMax_Count_MinMax, AOUTx2_AOPTOUTx2_AINx2, ATYPE_AASSIm , KOP_UNKNOWN , false ), AGO_KERNEL_ENTRY( VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA , 1, 0, MinMaxLocMerge_DATA_DATA, AOUTx2_AIN_AOPTINx7, ATYPE_SAAAAAAAAA , KOP_UNKNOWN , false ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_COPY_DATA_DATA , 1, 1, Copy_DATA_DATA, AOUT_AIN, ATYPE_RR , KOP_UNKNOWN , false ), + AGO_KERNEL_ENTRY( VX_KERNEL_AMD_SELECT_DATA_DATA_DATA , 1, 1, Select_DATA_DATA_DATA, AOUT_AIN, ATYPE_RSRR , KOP_UNKNOWN , false ), #undef AGO_KERNEL_ENTRY #undef OVX_KERNEL_ENTRY }; diff --git a/openvx/ago/ago_kernels.h b/openvx/ago/ago_kernels.h index 9f503b4..18f597b 100644 --- a/openvx/ago/ago_kernels.h +++ b/openvx/ago/ago_kernels.h @@ -434,6 +434,10 @@ enum vx_kernel_amd_e { VX_KERNEL_AMD_MIN_MAX_LOC_DATA_S16DATA_LOC_MINMAX_COUNT_MINMAX, VX_KERNEL_AMD_MIN_MAX_LOC_MERGE_DATA_DATA, + // OpenVX 1.2 kernels + VX_KERNEL_AMD_COPY_DATA_DATA, + VX_KERNEL_AMD_SELECT_DATA_DATA_DATA, + VX_KERNEL_AMD_MAX_1_0, // Used for bounds checking in the internal conformance test }; diff --git a/openvx/ago/ago_util.cpp b/openvx/ago/ago_util.cpp index 7a21ad4..827ee0a 100644 --- a/openvx/ago/ago_util.cpp +++ b/openvx/ago/ago_util.cpp @@ -1015,7 +1015,7 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData data->isNotFullyConfigured = vx_true_e; return 0; } - if (agoGetImageComponentsAndPlanes(acontext, data->u.img.format, &data->u.img.components, 
&data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range)) return -1; + if (agoGetImageComponentsAndPlanes(acontext, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits_num, &data->u.img.pixel_size_in_bits_denom, &data->u.img.color_space, &data->u.img.channel_range)) return -1; if (data->u.img.planes > 1) { if (data->children) delete [] data->children; @@ -1027,17 +1027,17 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData if (agoGetImagePlaneFormat(acontext, data->u.img.format, data->u.img.width, data->u.img.height, child, &format, &width, &height)) return -1; char imgdesc[64]; sprintf(imgdesc, "image%s:%4.4s,%d,%d", data->isVirtual ? "-virtual" : "", FORMAT_STR(format), width, height); if ((data->children[child] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; - if (agoGetImageComponentsAndPlanes(acontext, data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; + if (agoGetImageComponentsAndPlanes(acontext, data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits_num, &data->children[child]->u.img.pixel_size_in_bits_denom, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; data->children[child]->siblingIndex = (vx_int32)child; data->children[child]->parent = data; data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width ) ? 1 : 0; data->children[child]->u.img.y_scale_factor_is_2 = (data->children[child]->u.img.height != data->u.img.height) ? 
1 : 0; - data->children[child]->u.img.stride_in_bytes = ALIGN16((data->children[child]->u.img.width * data->children[child]->u.img.pixel_size_in_bits + 7) >> 3); + data->children[child]->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->children[child]->u.img.width, data->children[child])); data->children[child]->opencl_buffer_offset = OPENCL_IMAGE_FIXED_OFFSET + data->children[child]->u.img.stride_in_bytes; } } else if (data->u.img.planes == 1) { - data->u.img.stride_in_bytes = ALIGN16((data->u.img.width * data->u.img.pixel_size_in_bits + 7) >> 3); + data->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->u.img.width , data)); data->opencl_buffer_offset = OPENCL_IMAGE_FIXED_OFFSET + data->u.img.stride_in_bytes; } // sanity check and update @@ -1064,7 +1064,7 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData data->u.img.isUniform = vx_true_e; memcpy(&data->u.img.format, desc, sizeof(data->u.img.format)); if (sscanf(desc + 5, "%d,%d," VX_FMT_SIZE "," VX_FMT_SIZE "," VX_FMT_SIZE "," VX_FMT_SIZE "", &data->u.img.width, &data->u.img.height, &data->u.img.uniform[0], &data->u.img.uniform[1], &data->u.img.uniform[2], &data->u.img.uniform[3]) < 2) return -1; - if (agoGetImageComponentsAndPlanes(acontext, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range)) return -1; + if (agoGetImageComponentsAndPlanes(acontext, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits_num, &data->u.img.pixel_size_in_bits_denom, &data->u.img.color_space, &data->u.img.channel_range)) return -1; data->isInitialized = vx_true_e; if (data->u.img.planes > 1) { if (data->children) @@ -1083,7 +1083,7 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData char imgdesc[64]; sprintf(imgdesc, "image-uniform%s:%4.4s,%d,%d,%d", data->isVirtual ? 
"-virtual" : "", FORMAT_STR(format), width, height, value); if ((data->children[child] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; - if (agoGetImageComponentsAndPlanes(acontext, data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; + if (agoGetImageComponentsAndPlanes(acontext, data->children[child]->u.img.format, &data->children[child]->u.img.components, &data->children[child]->u.img.planes, &data->children[child]->u.img.pixel_size_in_bits_num, &data->children[child]->u.img.pixel_size_in_bits_denom, &data->children[child]->u.img.color_space, &data->children[child]->u.img.channel_range)) return -1; data->children[child]->isInitialized = vx_true_e; data->children[child]->parent = data; data->children[child]->u.img.x_scale_factor_is_2 = (data->children[child]->u.img.width != data->u.img.width ) ? 
1 : 0; @@ -1100,12 +1100,12 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData data->children[child]->u.img.minValue = (vx_int32)data->children[child]->u.img.uniform[0]; data->children[child]->u.img.maxValue = (vx_int32)data->children[child]->u.img.uniform[0]; } - data->children[child]->u.img.stride_in_bytes = ALIGN16((data->children[child]->u.img.width * data->children[child]->u.img.pixel_size_in_bits + 7) >> 3); + data->children[child]->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->children[child]->u.img.width, data->children[child])); data->children[child]->opencl_buffer_offset = OPENCL_IMAGE_FIXED_OFFSET + data->children[child]->u.img.stride_in_bytes; } } else if (data->u.img.planes == 1) { - data->u.img.stride_in_bytes = ALIGN16((data->u.img.width * data->u.img.pixel_size_in_bits + 7) >> 3); + data->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->u.img.width, data)); data->opencl_buffer_offset = OPENCL_IMAGE_FIXED_OFFSET + data->u.img.stride_in_bytes; } // set min/max values as uniform value @@ -1201,14 +1201,14 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData data->children[child]->u.img.stride_in_bytes = dataMaster->children[child]->u.img.stride_in_bytes; data->children[child]->opencl_buffer_offset = dataMaster->children[child]->opencl_buffer_offset + data->children[child]->u.img.rect_roi.start_y * data->children[child]->u.img.stride_in_bytes + - ((data->children[child]->u.img.rect_roi.start_x * (vx_uint32)data->children[child]->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor(data->children[child]->u.img.rect_roi.start_x, data->children[child]); } } else if (data->u.img.planes == 1) { data->u.img.stride_in_bytes = dataMaster->u.img.stride_in_bytes; data->opencl_buffer_offset = dataMaster->opencl_buffer_offset + data->u.img.rect_roi.start_y * data->u.img.stride_in_bytes + - ((data->u.img.rect_roi.start_x * (vx_uint32)data->u.img.pixel_size_in_bits) >> 3); + 
ImageWidthInBytesFloor(data->u.img.rect_roi.start_x, data); } // sanity check and update if (agoDataSanityCheckAndUpdate(data)) { @@ -1246,10 +1246,10 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData char imgdesc[64]; sprintf(imgdesc, "image%s:%4.4s,%d,%d", data->isVirtual ? "-virtual" : "", FORMAT_STR(data->u.pyr.format), width, height); if ((data->children[level] = agoCreateDataFromDescription(acontext, agraph, imgdesc, false)) == NULL) return -1; - if (agoGetImageComponentsAndPlanes(acontext, data->u.pyr.format, &data->children[level]->u.img.components, &data->children[level]->u.img.planes, &data->children[level]->u.img.pixel_size_in_bits, &data->children[level]->u.img.color_space, &data->children[level]->u.img.channel_range)) return -1; + if (agoGetImageComponentsAndPlanes(acontext, data->u.pyr.format, &data->children[level]->u.img.components, &data->children[level]->u.img.planes, &data->children[level]->u.img.pixel_size_in_bits_num, &data->children[level]->u.img.pixel_size_in_bits_denom, &data->children[level]->u.img.color_space, &data->children[level]->u.img.channel_range)) return -1; data->children[level]->siblingIndex = (vx_int32)level; data->children[level]->parent = data; - data->children[level]->u.img.stride_in_bytes = ALIGN16((data->children[level]->u.img.width * data->children[level]->u.img.pixel_size_in_bits + 7) >> 3); + data->children[level]->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->children[level]->u.img.width, data->children[level])); data->children[level]->opencl_buffer_offset = OPENCL_IMAGE_FIXED_OFFSET + data->children[level]->u.img.stride_in_bytes; if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) { float orb_scale_factor[4] = { @@ -1603,12 +1603,15 @@ int agoGetDataFromDescription(AgoContext * acontext, AgoGraph * agraph, AgoData if (agoParseWordFromDescription(desc, sizeof(data_type), data_type) < 0) return -1; data->u.tensor.data_type = agoName2Enum(data_type); - if (data->u.tensor.data_type 
!= VX_TYPE_INT16 && data->u.tensor.data_type != VX_TYPE_FLOAT32 && data->u.tensor.data_type != VX_TYPE_FLOAT16) { + if (data->u.tensor.data_type != VX_TYPE_INT16 && + data->u.tensor.data_type != VX_TYPE_UINT8 && data->u.tensor.data_type != VX_TYPE_UINT16 && + data->u.tensor.data_type != VX_TYPE_FLOAT32 && data->u.tensor.data_type != VX_TYPE_FLOAT16) + { agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGetDataFromDescription: invalid data_type for tensor: %s\n", data_type); return -1; } data->u.tensor.fixed_point_pos = 0; - if (data->u.tensor.data_type == VX_TYPE_INT16) { + if (data->u.tensor.data_type != VX_TYPE_FLOAT32 && data->u.tensor.data_type != VX_TYPE_FLOAT16) { if (*desc++ != ',') return -1; if (agoParseValueFromDescription(desc, data->u.tensor.fixed_point_pos) < 0) return -1; @@ -1809,35 +1812,36 @@ void agoGenerateVirtualDataName(AgoGraph * agraph, const char * postfix, std::st int agoInitializeImageComponentsAndPlanes(AgoContext * acontext) { - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_RGBX, 4, 1, 4 * 8, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_RGB, 3, 1, 3 * 8, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_NV12, 3, 2, 0, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_NV21, 3, 2, 0, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_UYVY, 3, 1, 2 * 8, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_YUYV, 3, 1, 2 * 8, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_IYUV, 3, 3, 0, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_YUV4, 3, 3, 0, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U8, 1, 1, 8, 
VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U16, 1, 1, 16, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_S16, 1, 1, 16, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U32, 1, 1, 32, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_S32, 1, 1, 32, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U1_AMD, 1, 1, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F32x3_AMD, 3, 1, 3 * 32, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F32_AMD, 1, 1, 32, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F64_AMD, 1, 1, 64, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); - agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F16_AMD, 1, 1, 16, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_RGBX, 4, 1, 4 * 8, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_RGB, 3, 1, 3 * 8, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_NV12, 3, 2, 0, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_NV21, 3, 2, 0, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_UYVY, 3, 1, 2 * 8, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_YUYV, 3, 1, 2 * 8, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_IYUV, 3, 3, 0, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + 
agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_YUV4, 3, 3, 0, 1, VX_COLOR_SPACE_DEFAULT, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U8, 1, 1, 8, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U16, 1, 1, 16, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_S16, 1, 1, 16, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U32, 1, 1, 32, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_S32, 1, 1, 32, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_U1_AMD, 1, 1, 1, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F32x3_AMD, 3, 1, 3 * 32, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F32_AMD, 1, 1, 32, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F64_AMD, 1, 1, 64, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); + agoSetImageComponentsAndPlanes(acontext, VX_DF_IMAGE_F16_AMD, 1, 1, 16, 1, VX_COLOR_SPACE_NONE, VX_CHANNEL_RANGE_FULL); return 0; } -int agoSetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size components, vx_size planes, vx_size pixelSizeInBits, vx_color_space_e colorSpace, vx_channel_range_e channelRange) +int agoSetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size components, vx_size planes, vx_uint32 pixelSizeInBitsNum, vx_uint32 pixelSizeInBitsDenom, vx_color_space_e colorSpace, vx_channel_range_e channelRange) { // check to make sure that there are duplicate entries for (auto it = acontext->image_format_list.begin(); it != acontext->image_format_list.end(); it++) { if (it->format == format) { if (it->desc.components == components 
&& it->desc.planes == planes && - it->desc.pixelSizeInBits == pixelSizeInBits && + it->desc.pixelSizeInBitsNum == pixelSizeInBitsNum && + it->desc.pixelSizeInBitsDenom == pixelSizeInBitsDenom && it->desc.colorSpace == colorSpace && it->desc.channelRange == channelRange) { @@ -1855,21 +1859,23 @@ int agoSetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx item.format = format; item.desc.components = components; item.desc.planes = planes; - item.desc.pixelSizeInBits = pixelSizeInBits; + item.desc.pixelSizeInBitsNum = pixelSizeInBitsNum; + item.desc.pixelSizeInBitsDenom = pixelSizeInBitsDenom; item.desc.colorSpace = colorSpace; item.desc.channelRange = channelRange; acontext->image_format_list.push_back(item); return 0; } -int agoGetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_size * pPixelSizeInBits, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange) +int agoGetImageComponentsAndPlanes(AgoContext * acontext, vx_df_image format, vx_size * pComponents, vx_size * pPlanes, vx_uint32 * pPixelSizeInBitsNum, vx_uint32 * pPixelSizeInBitsDenom, vx_color_space_e * pColorSpace, vx_channel_range_e * pChannelRange) { // search format in context for (auto it = acontext->image_format_list.begin(); it != acontext->image_format_list.end(); it++) { if (it->format == format) { *pComponents = it->desc.components; *pPlanes = it->desc.planes; - *pPixelSizeInBits = it->desc.pixelSizeInBits; + *pPixelSizeInBitsNum = (vx_uint32)it->desc.pixelSizeInBitsNum; + *pPixelSizeInBitsDenom = (vx_uint32)it->desc.pixelSizeInBitsDenom; *pColorSpace = it->desc.colorSpace; *pChannelRange = it->desc.channelRange; return 0; @@ -2132,11 +2138,11 @@ int agoDataSanityCheckAndUpdate(AgoData * data) } else if (data->u.img.isROI) { // re-compute image parameters to deal with parameter changes - agoGetImageComponentsAndPlanes(data->ref.context, data->u.img.format, &data->u.img.components, &data->u.img.planes, 
&data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range); + agoGetImageComponentsAndPlanes(data->ref.context, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits_num, &data->u.img.pixel_size_in_bits_denom, &data->u.img.color_space, &data->u.img.channel_range); // get buffer stride and compute buffer start address data->u.img.stride_in_bytes = data->u.img.roiMasterImage->u.img.stride_in_bytes; - if ((data->u.img.rect_roi.start_x * data->u.img.pixel_size_in_bits) & 7) { - agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: detected U1 ROI that doesn't start on 8-bit boundary: %s at (%d,%d)\n", data->name.length() ? data->name.c_str() : "", data->u.img.rect_roi.start_x, data->u.img.rect_roi.start_y); + if (((data->u.img.rect_roi.start_x * data->u.img.pixel_size_in_bits_num) & 7) || (data->u.img.pixel_size_in_bits_denom > 1)) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: detected ROI that doesn't start on 8-bit boundary: %s at (%d,%d)\n", data->name.length() ? 
data->name.c_str() : "", data->u.img.rect_roi.start_x, data->u.img.rect_roi.start_y); return -1; } // set valid region to overlap of parent and ROI @@ -2157,11 +2163,11 @@ int agoDataSanityCheckAndUpdate(AgoData * data) // re-compute image parameters to deal with parameter changes // NOTE: image buffer stride needs to be multiple of 16 bytes to support CPU/GPU optimizations // NOTE: image buffer height needs to be mutliple of 16 to support OpenCL workgroup height=16 - agoGetImageComponentsAndPlanes(data->ref.context, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits, &data->u.img.color_space, &data->u.img.channel_range); + agoGetImageComponentsAndPlanes(data->ref.context, data->u.img.format, &data->u.img.components, &data->u.img.planes, &data->u.img.pixel_size_in_bits_num, &data->u.img.pixel_size_in_bits_denom, &data->u.img.color_space, &data->u.img.channel_range); // calculate other attributes and buffer size: // - make sure that the stride is multiple of 16 bytes if (data->import_type == VX_IMPORT_TYPE_NONE) { - data->u.img.stride_in_bytes = ALIGN16((data->u.img.width * data->u.img.pixel_size_in_bits + 7) >> 3); + data->u.img.stride_in_bytes = ALIGN16(ImageWidthInBytesCeil(data->u.img.width, data)); data->size = ALIGN16(data->u.img.height) * data->u.img.stride_in_bytes; } else { @@ -2311,6 +2317,10 @@ int agoAllocData(AgoData * data) // can't proceed further return -1; } + else if (data->isVirtual && (data->device_type_unused & AGO_TARGET_AFFINITY_CPU)) { + // no need to allocate: unused CPU buffers + return 0; + } if (data->ref.type == VX_TYPE_DELAY) { for (vx_uint32 child = 0; child < data->numChildren; child++) { @@ -2366,7 +2376,7 @@ int agoAllocData(AgoData * data) // get the region from master image data->buffer = data->u.img.roiMasterImage->buffer + data->u.img.rect_roi.start_y * data->u.img.stride_in_bytes + - ((data->u.img.rect_roi.start_x * data->u.img.pixel_size_in_bits) >> 3); + 
ImageWidthInBytesFloor(data->u.img.rect_roi.start_x, data); } else { if (data->u.img.isUniform) { @@ -2934,6 +2944,7 @@ AgoData::AgoData() opencl_buffer_offset{ 0 }, isVirtual{ vx_false_e }, isDelayed{ vx_false_e }, isNotFullyConfigured{ vx_false_e }, isInitialized{ vx_false_e }, siblingIndex{ 0 }, numChildren{ 0 }, children{ nullptr }, parent{ nullptr }, inputUsageCount{ 0 }, outputUsageCount{ 0 }, inoutUsageCount{ 0 }, + initialization_flags{ 0 }, device_type_unused{ 0 }, nextMapId{ 0 }, hierarchical_level{ 0 }, hierarchical_life_start{ 0 }, hierarchical_life_end{ 0 }, ownerOfUserBufferOpenCL{ nullptr } { memset(&u, 0, sizeof(u)); @@ -2983,10 +2994,12 @@ AgoSuperNode::AgoSuperNode() #if ENABLE_OPENCL opencl_cmdq{ nullptr }, opencl_program{ nullptr }, opencl_kernel{ nullptr }, opencl_event{ nullptr }, #endif + hierarchical_level_start{ 0 }, hierarchical_level_end{ 0 }, status{ VX_SUCCESS } { #if ENABLE_OPENCL memset(&opencl_global_work, 0, sizeof(opencl_global_work)); + memset(&opencl_local_work, 0, sizeof(opencl_local_work)); #endif memset(&perf, 0, sizeof(perf)); } @@ -2997,6 +3010,7 @@ AgoNode::AgoNode() : next{ nullptr }, akernel{ nullptr }, flags{ 0 }, localDataSize{ 0 }, localDataPtr{ nullptr }, localDataPtr_allocated{ nullptr }, valid_rect_reset{ vx_true_e }, valid_rect_num_inputs{ 0 }, valid_rect_num_outputs{ 0 }, valid_rect_inputs{ nullptr }, valid_rect_outputs{ nullptr }, paramCount{ 0 }, callback{ nullptr }, supernode{ nullptr }, initialized{ false }, target_support_flags{ 0 }, hierarchical_level{ 0 }, status{ VX_SUCCESS } + , drama_divide_invoked{ false } #if ENABLE_OPENCL , opencl_type{ 0 }, opencl_param_mem2reg_mask{ 0 }, opencl_param_discard_mask{ 0 }, opencl_param_as_value_mask{ 0 }, opencl_param_atomic_mask{ 0 }, opencl_local_buffer_usage_mask{ 0 }, opencl_local_buffer_size_in_bytes{ 0 }, opencl_work_dim{ 0 }, @@ -3050,6 +3064,7 @@ AgoGraph::AgoGraph() virtualDataGenerationCount{ 0 }, optimizer_flags{ AGO_GRAPH_OPTIMIZER_FLAGS_DEFAULT }, verified{ 
false }, enable_performance_profiling{ false }, execFrameCount{ 0 } #if ENABLE_OPENCL , supernodeList{ nullptr }, opencl_cmdq{ nullptr }, opencl_device{ nullptr } + , enable_node_level_opencl_flush{ true } #endif { memset(&dataList, 0, sizeof(dataList)); @@ -3097,6 +3112,7 @@ AgoContext::AgoContext() #endif , opencl_context_imported{ false }, opencl_context{ nullptr }, opencl_cmdq{ nullptr }, opencl_config_flags{ 0 }, opencl_num_devices{ 0 }, isAmdMediaOpsSupported{ true } , opencl_mem_alloc_size{ 0 }, opencl_mem_alloc_count{ 0 }, opencl_mem_release_count{ 0 } + , opencl_cmdq_properties{ 0 } #endif { memset(&kernelList, 0, sizeof(kernelList)); @@ -3140,13 +3156,6 @@ AgoContext::~AgoContext() agraph = next; } - agoResetDataList(&dataList); - for (AgoData * data = graph_garbage_data; data;) { - AgoData * item = data; - data = data->next; - delete item; - } - for (AgoNode * node = graph_garbage_node; node;) { AgoNode * item = node; node = node->next; @@ -3159,6 +3168,13 @@ AgoContext::~AgoContext() delete item; } + agoResetDataList(&dataList); + for (AgoData * data = graph_garbage_data; data;) { + AgoData * item = data; + data = data->next; + delete item; + } + for (auto it = macros.begin(); it != macros.end(); ++it) { if (it->text_allocated) free(it->text_allocated); diff --git a/openvx/ago/ago_util_opencl.cpp b/openvx/ago/ago_util_opencl.cpp index ab3b1b1..236ad28 100644 --- a/openvx/ago/ago_util_opencl.cpp +++ b/openvx/ago/ago_util_opencl.cpp @@ -317,9 +317,10 @@ int agoGpuOclCreateContext(AgoContext * context, cl_context opencl_context) #endif // create command queue for buffer sync #if defined(CL_VERSION_2_0) - context->opencl_cmdq = clCreateCommandQueueWithProperties(context->opencl_context, context->opencl_device_list[device_id], NULL, &status); + cl_queue_properties properties[] = { CL_QUEUE_PROPERTIES, context->opencl_cmdq_properties, 0 }; + context->opencl_cmdq = clCreateCommandQueueWithProperties(context->opencl_context, 
context->opencl_device_list[device_id], properties, &status); #else - context->opencl_cmdq = clCreateCommandQueue(context->opencl_context, context->opencl_device_list[device_id], 0, &status); + context->opencl_cmdq = clCreateCommandQueue(context->opencl_context, context->opencl_device_list[device_id], context->opencl_cmdq_properties, &status); #endif if (status) { agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: clCreateCommandQueueWithProperties(%p,%p,0,*) => %d\n", context->opencl_context, context->opencl_device_list[device_id], status); @@ -547,24 +548,6 @@ int agoGpuOclAllocBuffer(AgoData * data) return 0; } -int agoGpuOclAllocBuffers(AgoGraph * graph, AgoNode * node) -{ - for (vx_uint32 i = 0; i < node->paramCount; i++) { - AgoData * data = node->paramList[i]; - if (data && !data->opencl_buffer) { - if (agoIsPartOfDelay(data)) { - int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0; - data = agoGetSiblingTraceToDelayForUpdate(data, siblingTrace, siblingTraceCount); - if (!data) return -1; - } - if (agoGpuOclAllocBuffer(data) < 0) { - return -1; - } - } - } - return 0; -} - int agoGpuOclSuperNodeMerge(AgoGraph * graph, AgoSuperNode * supernode, AgoNode * node) { // sanity check @@ -834,6 +817,18 @@ static int agoGpuOclSetKernelArgs(cl_kernel opencl_kernel, vx_uint32& kernelArgI return -1; } kernelArgIndex++; + vx_uint32 stride[4] = { + (vx_uint32)data->u.tensor.stride[0], + (vx_uint32)data->u.tensor.stride[1], + (vx_uint32)data->u.tensor.stride[2], + (vx_uint32)data->u.tensor.stride[3] + }; + err = clSetKernelArg(opencl_kernel, (cl_uint)kernelArgIndex, sizeof(stride), stride); + if (err) { + agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: clSetKernelArg(supernode,%d,*,tensor.offset) failed(%d) for group#%d\n", (cl_uint)kernelArgIndex, err, group); + return -1; + } + kernelArgIndex++; } else { agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoGpuOclSetKernelArgs: doesn't support object type %s in group#%d for kernel arg setting\n", 
agoEnum2Name(data->ref.type), group); @@ -1100,7 +1095,7 @@ static int agoGpuOclDataInputSync(AgoGraph * graph, cl_kernel opencl_kernel, vx_ if (agoGpuOclDataSetBufferAsKernelArg(data, opencl_kernel, kernelArgIndex, group) < 0) return -1; } - kernelArgIndex += 2; + kernelArgIndex += 3; if (need_read_access) { auto dataToSync = data->u.tensor.roiMaster ? data->u.tensor.roiMaster : data; if (!(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) { @@ -1341,7 +1336,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) if (code.find("#pragma OPENCL EXTENSION cl_amd_media_ops : enable") != std::string::npos) { std::string clmediaopscode = OPENCL_FORMAT( - "uint amd_pack(float4 src){\n" + "inline uint amd_pack(float4 src){\n" " uint dst = ((uint)(clamp (src.s0,0.0f,255.0f)) )\n" " + ((uint)(clamp (src.s1,0.0f,255.0f))<< 8 ) \n" " + ((uint)(clamp (src.s2,0.0f,255.0f))<< 16) \n" @@ -1349,37 +1344,37 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst;\n" "}\n" "\n" - "float amd_unpack3(uint src){\n" + "inline float amd_unpack3(uint src){\n" " float dst= (float)((src >> 24) & 0xff);\n" " return dst;\n" "}\n" "\n" - "float amd_unpack2(uint src){\n" + "inline float amd_unpack2(uint src){\n" " float dst= (float)((src >> 16) & 0xff);\n" " return dst;\n" "}\n" "\n" - "float amd_unpack1(uint src){\n" + "inline float amd_unpack1(uint src){\n" " float dst= (float)((src >> 8) & 0xff);\n" " return dst;\n" "}\n" "\n" - "float amd_unpack0(uint src){\n" + "inline float amd_unpack0(uint src){\n" " float dst= (float)((src)& 0xff);\n" " return dst;\n" "}\n" "\n" - "uint amd_bitalign(uint src0,uint src1, uint src2){\n" + "inline uint amd_bitalign(uint src0,uint src1, uint src2){\n" " uint dst = (uint)(as_ulong((uint2)(src1,src0)) >> (src2 & 31));\n" " return dst;\n" "}\n" "\n" - "uint amd_bytealign(uint src0,uint src1, uint src2){\n" + "inline uint amd_bytealign(uint src0,uint src1, uint src2){\n" " uint dst = 
(uint)(as_ulong((uint2)(src1,src0)) >> (src2 & 31) * 8 );\n" " return dst;\n" "}\n" "\n" - "uint amd_lerp(uint src0, uint src1, uint src2) {\n" + "inline uint amd_lerp(uint src0, uint src1, uint src2) {\n" " uint dst = (((((src0 >> 0) & 0xff) + ((src1 >> 0) & 0xff) + ((src2 >> 0) & 1)) >> 1) << 0) + \n" " (((((src0 >> 8) & 0xff) + ((src1 >> 8) & 0xff) + ((src2 >> 8) & 1)) >> 1) << 8) + \n" " (((((src0 >> 16) & 0xff) + ((src1 >> 16) & 0xff) + ((src2 >> 16) & 1)) >> 1) << 16) + \n" @@ -1387,7 +1382,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst;" "}\n" "\n" - "uint amd_sad(uint src0, uint src1, uint src2){ \n" + "inline uint amd_sad(uint src0, uint src1, uint src2){ \n" " uint dst = src2 + \n" " abs(((src0 >> 0) & 0xff) - ((src1 >> 0) & 0xff)) + \n" " abs(((src0 >> 8) & 0xff) - ((src1 >> 8) & 0xff)) + \n" @@ -1396,7 +1391,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "uint amd_sadhi(uint src0, uint src1, uint src2){ \n" + "inline uint amd_sadhi(uint src0, uint src1, uint src2){ \n" " uint dst = src2 + \n" " (abs(((src0 >> 0) & 0xff) - ((src1 >> 0) & 0xff)) << 16) + \n" " (abs(((src0 >> 8) & 0xff) - ((src1 >> 8) & 0xff)) << 16) + \n" @@ -1405,7 +1400,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "uint amd_sad4(uint4 src0, uint4 src1, uint src2) { \n" + "inline uint amd_sad4(uint4 src0, uint4 src1, uint src2) { \n" " uint dst = src2 + \n" " abs(((src0.s0 >> 0) & 0xff) - ((src1.s0 >> 0) & 0xff)) + \n" " abs(((src0.s0 >> 8) & 0xff) - ((src1.s0 >> 8) & 0xff)) + \n" @@ -1429,7 +1424,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) ); std::string clmediaops2code = OPENCL_FORMAT( - "uint amd_msad(uint src0, uint src1, uint src2){ \n" + "inline uint amd_msad(uint src0, uint src1, uint src2){ \n" " uchar4 src0u8 = as_uchar4(src0); \n" " uchar4 src1u8 = as_uchar4(src1); \n" " uint dst = src2 + \n" @@ -1440,7 +1435,7 @@ static 
void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "ulong amd_qsad(ulong src0, uint src1, ulong src2) { \n" + "inline ulong amd_qsad(ulong src0, uint src1, ulong src2) { \n" " uchar8 src0u8 = as_uchar8(src0); \n" " ushort4 src2u16 = as_ushort4(src2); \n" " ushort4 dstu16; \n" @@ -1452,7 +1447,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "ulong amd_mqsad(ulong src0, uint src1, ulong src2) { \n" + "inline ulong amd_mqsad(ulong src0, uint src1, ulong src2) { \n" " uchar8 src0u8 = as_uchar8(src0); \n" " ushort4 src2u16 = as_ushort4(src2); \n" " ushort4 dstu16; \n" @@ -1464,7 +1459,7 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "uint amd_sadw(uint src0, uint src1, uint src2) { \n" + "inline uint amd_sadw(uint src0, uint src1, uint src2) { \n" " ushort2 src0u16 = as_ushort2(src0); \n" " ushort2 src1u16 = as_ushort2(src1); \n" " uint dst = src2 + \n" @@ -1473,12 +1468,12 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst; \n" "}\n" "\n" - "uint amd_sadd(uint src0, uint src1, uint src2) { \n" + "inline uint amd_sadd(uint src0, uint src1, uint src2) { \n" " uint dst = src2 + abs(src0 - src1); \n" " return dst; \n" "}\n" "\n" - "uint amd_bfe(uint src0, uint src1, uint src2) { \n" + "inline uint amd_bfe(uint src0, uint src1, uint src2) { \n" " uint dst;" " uint offset = src1 & 31;\n" " uint width = src2 & 31;\n" @@ -1491,22 +1486,22 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) " return dst;\n" "}\n" "\n" - "uint amd_bfm(uint src0 , uint src1){ \n" + "inline uint amd_bfm(uint src0 , uint src1){ \n" " uint dst = ((1 << (src0 & 0x1f)) - 1) << (src1 & 0x1f); \n" " return dst; \n" "}\n" "\n" - "uint amd_min3(uint src0, uint src1, uint src2) { \n" + "inline uint amd_min3(uint src0, uint src1, uint src2) { \n" " uint dst = min(src0, min(src1,src2));\n" " return dst;\n " "}\n" "\n" - "uint amd_max3(uint 
src0, uint src1, uint src2) { \n" + "inline uint amd_max3(uint src0, uint src1, uint src2) { \n" " uint dst = max(src0, max(src1,src2)); \n" " return dst; \n" "}\n" "\n" - "uint amd_median3(uint src0, uint src1, uint src2){ \n" + "inline uint amd_median3(uint src0, uint src1, uint src2){ \n" " uint dst = max(min(src0,src1), min(max(src0,src1),src2)); \n" " return dst; \n" "}\n" @@ -1517,12 +1512,12 @@ static void agoEmulateAmdMediaOpsInOpenCL(std::string& code) } } -int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) +int agoGpuOclSuperNodeUpdate(AgoGraph * graph, AgoSuperNode * supernode) { // make sure that all output images have same dimensions // check to make sure that max input hierarchy level is less than min output hierarchy level vx_uint32 width = 0, height = 0; - vx_uint32 max_input_hierarchical_level = 0, min_output_hierarchical_level = (1 << 30); + vx_uint32 max_input_hierarchical_level = 0, min_output_hierarchical_level = INT_MAX; for (size_t index = 0; index < supernode->dataList.size(); index++) { AgoData * data = supernode->dataList[index]; if (data->ref.type == VX_TYPE_IMAGE && supernode->dataInfo[index].argument_usage[VX_INPUT] == 0) { @@ -1552,15 +1547,38 @@ int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: doesn't support mix of hierarchical levels inside same group#%d\n", supernode->group); return -1; } + supernode->width = width; + supernode->height = height; + + // mark hierarchical level (start,end) of all supernodes + for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) { + supernode->hierarchical_level_start = INT_MAX; + supernode->hierarchical_level_end = 0; + for (AgoNode * node : supernode->nodeList) { + supernode->hierarchical_level_start = min(supernode->hierarchical_level_start, node->hierarchical_level); + supernode->hierarchical_level_end = 
max(supernode->hierarchical_level_end, node->hierarchical_level); + } + } + + return 0; +} + +int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) +{ + // get supernode image dimensions + vx_uint32 width = supernode->width; + vx_uint32 height = supernode->height; // decide work group dimensions (256 work-items) vx_uint32 work_group_width = AGO_OPENCL_WORKGROUP_SIZE_0; vx_uint32 work_group_height = AGO_OPENCL_WORKGROUP_SIZE_1; // save image size and compute global work // - each work item processes 8x1 pixels - supernode->width = width; - supernode->height = height; supernode->opencl_global_work[0] = (((width + 7) >> 3) + (work_group_width - 1)) & ~(work_group_width - 1); supernode->opencl_global_work[1] = ( height + (work_group_height - 1)) & ~(work_group_height - 1); + supernode->opencl_global_work[2] = 1; + supernode->opencl_local_work[0] = work_group_width; + supernode->opencl_local_work[1] = work_group_height; + supernode->opencl_local_work[2] = 1; for (size_t index = 0; index < supernode->dataList.size(); index++) { AgoData * data = supernode->dataList[index]; } @@ -1722,6 +1740,11 @@ int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) if (node->akernel->func) { node->opencl_code = ""; status = node->akernel->func(node, ago_kernel_cmd_opencl_codegen); + for(vx_size dim = node->opencl_work_dim; dim < 3; dim++) { + node->opencl_global_work[dim] = 1; + node->opencl_local_work[dim] = 1; + } + node->opencl_work_dim = 3; } else if (node->akernel->opencl_codegen_callback_f) { // generation function declaration @@ -1769,12 +1792,14 @@ int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) node->opencl_output_array_param_index_plus1 = 0; node->opencl_local_buffer_usage_mask = 0; node->opencl_local_buffer_size_in_bytes = 0; - vx_uint32 work_dim = 2; - vx_size global_work[3] = { supernode->opencl_global_work[0], supernode->opencl_global_work[1], 1 }; - vx_size local_work[3] = { work_group_width, 
work_group_height, 1 }; status = node->akernel->opencl_codegen_callback_f(node, (vx_reference *)node->paramList, node->paramCount, - true, node->opencl_name, node->opencl_code, node->opencl_build_options, work_dim, global_work, - local_work, node->opencl_local_buffer_usage_mask, node->opencl_local_buffer_size_in_bytes); + true, node->opencl_name, node->opencl_code, node->opencl_build_options, node->opencl_work_dim, supernode->opencl_global_work, + supernode->opencl_local_work, node->opencl_local_buffer_usage_mask, node->opencl_local_buffer_size_in_bytes); + for(vx_size dim = node->opencl_work_dim; dim < 3; dim++) { + node->opencl_global_work[dim] = 1; + node->opencl_local_work[dim] = 1; + } + node->opencl_work_dim = 3; } if (status != VX_SUCCESS) { agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoGpuOclSuperNodeFinalize: kernel %s in group#%d is not supported yet\n", node->akernel->name, supernode->group); @@ -2056,9 +2081,9 @@ int agoGpuOclSuperNodeLaunch(AgoGraph * graph, AgoSuperNode * supernode) // launch the kernel int64_t stime = agoGetClockCounter(); cl_int err; - err = clEnqueueNDRangeKernel(supernode->opencl_cmdq, supernode->opencl_kernel, 2, NULL, supernode->opencl_global_work, NULL, 0, NULL, &supernode->opencl_event); + err = clEnqueueNDRangeKernel(supernode->opencl_cmdq, supernode->opencl_kernel, 3, NULL, supernode->opencl_global_work, supernode->opencl_local_work, 0, NULL, &supernode->opencl_event); if (err) { - agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,2,*,%dx%d,...) failed(%d) for group#%d\n", (cl_uint)supernode->opencl_global_work[0], (cl_uint)supernode->opencl_global_work[1], err, supernode->group); + agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,3,*,{%d,%d,%d},{%d,%d,%d},...) 
failed(%d) for group#%d\n", (cl_uint)supernode->opencl_global_work[0], (cl_uint)supernode->opencl_global_work[1], (cl_uint)supernode->opencl_global_work[2], (cl_uint)supernode->opencl_local_work[0], (cl_uint)supernode->opencl_local_work[1], (cl_uint)supernode->opencl_local_work[2], err, supernode->group); return -1; } err = clFlush(supernode->opencl_cmdq); @@ -2230,20 +2255,32 @@ int agoGpuOclSingleNodeLaunch(AgoGraph * graph, AgoNode * node) agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoGpuOclSingleNodeLaunch: invalid opencl_global_work_update_callback_f failed (%d) for kernel %s\n", status, node->akernel->name); return -1; } + for(vx_size dim = node->opencl_work_dim; dim < 3; dim++) { + node->opencl_global_work[dim] = 1; + node->opencl_local_work[dim] = 1; + } + node->opencl_work_dim = 3; } // launch the kernel int64_t stime = agoGetClockCounter(); cl_int err; - err = clEnqueueNDRangeKernel(graph->opencl_cmdq, node->opencl_kernel, node->opencl_work_dim, NULL, node->opencl_global_work, NULL, 0, NULL, &node->opencl_event); - if (err) { - agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,%d,*,{%d,%d,%d},...) 
failed(%d) for %s\n", (cl_uint)node->opencl_work_dim, (cl_uint)node->opencl_global_work[0], (cl_uint)node->opencl_global_work[1], (cl_uint)node->opencl_global_work[2], err, node->akernel->name); - return -1; + if(node->opencl_local_work[0] != 0) { + err = clEnqueueNDRangeKernel(graph->opencl_cmdq, node->opencl_kernel, node->opencl_work_dim, NULL, node->opencl_global_work, node->opencl_local_work, 0, NULL, &node->opencl_event); + } + else { + err = clEnqueueNDRangeKernel(graph->opencl_cmdq, node->opencl_kernel, node->opencl_work_dim, NULL, node->opencl_global_work, NULL, 0, NULL, &node->opencl_event); } - err = clFlush(graph->opencl_cmdq); if (err) { - agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clFlush(supernode) failed(%d) for %s\n", err, node->akernel->name); + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clEnqueueNDRangeKernel(supernode,%d,*,{%d,%d,%d},{%d,%d,%d},...) failed(%d) for %s\n", (cl_uint)node->opencl_work_dim, (cl_uint)node->opencl_global_work[0], (cl_uint)node->opencl_global_work[1], (cl_uint)node->opencl_global_work[2], (cl_uint)node->opencl_local_work[0], (cl_uint)node->opencl_local_work[1], (cl_uint)node->opencl_local_work[2], err, node->akernel->name); return -1; } + if(graph->enable_node_level_opencl_flush) { + err = clFlush(graph->opencl_cmdq); + if (err) { + agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: clFlush(supernode) failed(%d) for %s\n", err, node->akernel->name); + return -1; + } + } int64_t etime = agoGetClockCounter(); graph->opencl_perf.kernel_enqueue += etime - stime; // mark that node outputs are dirty diff --git a/openvx/api/vx_api.cpp b/openvx/api/vx_api.cpp index faa9aaa..297c66b 100644 --- a/openvx/api/vx_api.cpp +++ b/openvx/api/vx_api.cpp @@ -92,7 +92,33 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetContextImageFormatDescription(vx_context vx_status status = VX_ERROR_INVALID_REFERENCE; if (agoIsValidContext(context)) { status = VX_ERROR_INVALID_FORMAT; - if (desc->planes == 1 && !agoSetImageComponentsAndPlanes(context, 
format, desc->components, desc->planes, desc->pixelSizeInBits, desc->colorSpace, desc->channelRange)) { + if (desc->planes == 1 && !agoSetImageComponentsAndPlanes(context, format, desc->components, desc->planes, (vx_uint32)desc->pixelSizeInBitsNum, (vx_uint32)(desc->pixelSizeInBitsDenom ? desc->pixelSizeInBitsDenom : 1), desc->colorSpace, desc->channelRange)) { + status = VX_SUCCESS; + } + } + return status; +} + +/** +* \brief Get custom image format description. +* \ingroup vx_framework_reference +* \param [in] context The context. +* \param [in] format The image format. +* \param [out] desc The image format description. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE if reference is not valid. +* \retval VX_ERROR_INVALID_FORMAT if format is already in use. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetContextImageFormatDescription(vx_context context, vx_df_image format, AgoImageFormatDescription * desc) +{ + vx_status status = VX_ERROR_INVALID_REFERENCE; + if (agoIsValidContext(context)) { + status = VX_ERROR_INVALID_FORMAT; + vx_uint32 pixelSizeInBitsNum, pixelSizeInBitsDenom; + if (!agoGetImageComponentsAndPlanes(context, format, &desc->components, &desc->planes, &pixelSizeInBitsNum, &pixelSizeInBitsDenom, &desc->colorSpace, &desc->channelRange)) { + desc->pixelSizeInBitsNum = pixelSizeInBitsNum; + desc->pixelSizeInBitsDenom = pixelSizeInBitsDenom; status = VX_SUCCESS; } } @@ -235,6 +261,12 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at status = VX_SUCCESS; } break; + case VX_CONTEXT_CL_QUEUE_PROPERTIES: + if (size == sizeof(cl_command_queue_properties)) { + *(cl_command_queue_properties *)ptr = context->opencl_cmdq_properties; + status = VX_SUCCESS; + } + break; #endif case VX_CONTEXT_MAX_TENSOR_DIMENSIONS: if (size == sizeof(vx_size)) { @@ -331,6 +363,12 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetContextAttribute(vx_context context, vx_ } } break; + case 
VX_CONTEXT_CL_QUEUE_PROPERTIES: + if (size == sizeof(cl_command_queue_properties)) { + context->opencl_cmdq_properties = *(cl_command_queue_properties *)ptr; + status = VX_SUCCESS; + } + break; #endif default: status = VX_ERROR_NOT_SUPPORTED; @@ -884,7 +922,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapImageHandle(vx_image image_, void* cons for (auto roi = image->children[i]->roiDepList.begin(); roi != image->children[i]->roiDepList.end(); roi++) { (*roi)->buffer = image->children[i]->buffer + image->children[i]->u.img.rect_roi.start_y * image->children[i]->u.img.stride_in_bytes + - ((image->children[i]->u.img.rect_roi.start_x * image->children[i]->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor(image->children[i]->u.img.rect_roi.start_x, image->children[i]); } } } @@ -899,7 +937,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxSwapImageHandle(vx_image image_, void* cons for (auto roi = image->roiDepList.begin(); roi != image->roiDepList.end(); roi++) { (*roi)->buffer = image->buffer + image->u.img.rect_roi.start_y * image->u.img.stride_in_bytes + - ((image->u.img.rect_roi.start_x * image->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor(image->u.img.rect_roi.start_x, image); } } } @@ -1173,8 +1211,8 @@ VX_API_ENTRY vx_size VX_API_CALL vxComputeImagePatchSize(vx_image image_, if (image->children) { img = image->children[plane_index]; } - size = (((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * - ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3; + size = ImageWidthInBytesFloor(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2), img) * + ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2); } return size; } @@ -1241,12 +1279,13 @@ VX_API_ENTRY vx_status VX_API_CALL vxAccessImagePatch(vx_image image_, addr->scale_y = VX_SCALE_UNITY >> img->u.img.y_scale_factor_is_2; addr->step_x = 1 << img->u.img.x_scale_factor_is_2; addr->step_y = 1 << 
img->u.img.y_scale_factor_is_2; - addr->stride_x = ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3; + addr->stride_x = ((img->u.img.pixel_size_in_bits_num & 7) || (img->u.img.pixel_size_in_bits_denom > 1)) ? + 0 : (img->u.img.pixel_size_in_bits_num >> 3); addr->stride_y = img->u.img.stride_in_bytes; } vx_uint8 * ptr_internal = img->buffer + (rect->start_y >> img->u.img.y_scale_factor_is_2) * img->u.img.stride_in_bytes + - (((rect->start_x >> img->u.img.x_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor((rect->start_x >> img->u.img.x_scale_factor_is_2), img); vx_uint8 * ptr_returned = *ptr ? (vx_uint8 *)*ptr : ptr_internal; // save the pointer and usage for use in vxCommitImagePatch status = VX_SUCCESS; @@ -1279,12 +1318,12 @@ VX_API_ENTRY vx_status VX_API_CALL vxAccessImagePatch(vx_image image_, #endif if (item.used_external_ptr) { // copy if read is requested with explicit external buffer - if (addr->stride_x == ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3) - HafCpu_ChannelCopy_U8_U8(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * addr->stride_x, ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), - ptr_returned, addr->stride_y, ptr_internal, img->u.img.stride_in_bytes); + if (addr->stride_x == 0 || ((addr->stride_x << 3) == img->u.img.pixel_size_in_bits_num && img->u.img.pixel_size_in_bits_denom == 1)) + HafCpu_ChannelCopy_U8_U8(ImageWidthInBytesFloor((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2, img), + ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), ptr_returned, addr->stride_y, ptr_internal, img->u.img.stride_in_bytes); else HafCpu_BufferCopyDisperseInDst(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2), ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), - ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3, ptr_returned, addr->stride_y, addr->stride_x, ptr_internal, img->u.img.stride_in_bytes); + 
(img->u.img.pixel_size_in_bits_num / img->u.img.pixel_size_in_bits_denom + 7) >> 3, ptr_returned, addr->stride_y, addr->stride_x, ptr_internal, img->u.img.stride_in_bytes); } } } @@ -1362,14 +1401,14 @@ VX_API_ENTRY vx_status VX_API_CALL vxCommitImagePatch(vx_image image_, if (used_external_ptr) { // copy from external buffer vx_uint8 * buffer = img->buffer + (rect->start_y >> img->u.img.y_scale_factor_is_2) * img->u.img.stride_in_bytes + - (((rect->start_x >> img->u.img.x_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor((rect->start_x >> img->u.img.x_scale_factor_is_2), img); - if (addr->stride_x == ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3) - HafCpu_ChannelCopy_U8_U8(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * addr->stride_x, ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), - buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y); + if (addr->stride_x == 0 || ((addr->stride_x << 3) == img->u.img.pixel_size_in_bits_num && img->u.img.pixel_size_in_bits_denom == 1)) + HafCpu_ChannelCopy_U8_U8(ImageWidthInBytesFloor(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2), img), + ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y); else HafCpu_BufferCopyDisperseInSrc(((rect->end_x - rect->start_x) >> img->u.img.x_scale_factor_is_2) * addr->stride_x, ((rect->end_y - rect->start_y) >> img->u.img.y_scale_factor_is_2), - ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3, buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y, addr->stride_x); + (img->u.img.pixel_size_in_bits_num / img->u.img.pixel_size_in_bits_denom + 7) >> 3, buffer, img->u.img.stride_in_bytes, (vx_uint8 *)ptr, addr->stride_y, addr->stride_x); } // update sync flags auto dataToSync = img->u.img.isROI ? 
img->u.img.roiMasterImage : img; @@ -1602,7 +1641,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxMapImagePatch(vx_image image_, const vx_rec } vx_uint8 * ptr_returned = img->buffer + (rect->start_y >> img->u.img.y_scale_factor_is_2) * img->u.img.stride_in_bytes + - (((rect->start_x >> img->u.img.x_scale_factor_is_2) * img->u.img.pixel_size_in_bits) >> 3); + ImageWidthInBytesFloor((rect->start_x >> img->u.img.x_scale_factor_is_2), img); // save the pointer and usage for use in vxCommitImagePatch status = VX_SUCCESS; for (auto i = img->mapped.begin(); i != img->mapped.end(); i++) { @@ -1641,7 +1680,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxMapImagePatch(vx_image image_, const vx_rec addr->scale_y = VX_SCALE_UNITY >> img->u.img.y_scale_factor_is_2; addr->step_x = 1 << img->u.img.x_scale_factor_is_2; addr->step_y = 1 << img->u.img.y_scale_factor_is_2; - addr->stride_x = ((vx_uint32)img->u.img.pixel_size_in_bits + 7) >> 3; + addr->stride_x = (img->u.img.pixel_size_in_bits_denom > 1 || (img->u.img.pixel_size_in_bits_num & 7)) ? 0 : (img->u.img.pixel_size_in_bits_num >> 3); addr->stride_y = img->u.img.stride_in_bytes; } } @@ -1698,8 +1737,8 @@ VX_API_ENTRY vx_status VX_API_CALL vxUnmapImagePatch(vx_image image_, vx_map_id * The function supports only channels that occupy an entire plane of a multi-planar * images, as listed below. Other cases are not supported. * VX_CHANNEL_Y from YUV4, IYUV, NV12, NV21 -* VX_CHANNEL_U from YUV4, IYUV -* VX_CHANNEL_V from YUV4, IYUV +* VX_CHANNEL_U from YUV4, IYUV, NV12, NV21 +* VX_CHANNEL_V from YUV4, IYUV, NV12, NV21 * * \param [in] img The reference to the parent image. * \param [in] channel The \ref vx_channel_e channel to use. 
@@ -1728,6 +1767,10 @@ VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromChannel(vx_image img, vx_enum { subImage = image->children[2]; } + else if ((channel == VX_CHANNEL_U || channel == VX_CHANNEL_V) && (image->u.img.format == VX_DF_IMAGE_NV12 || image->u.img.format == VX_DF_IMAGE_NV21)) + { + subImage = image->children[1]; + } } } if (subImage) { @@ -3190,7 +3233,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByIndex(vx_node node, vx_uint32 else if (node->parameters[index].state == VX_PARAMETER_STATE_REQUIRED && !value) { status = VX_ERROR_INVALID_REFERENCE; } - else if ((index < node->paramCount) && (!node->parameters[index].type || !value || node->parameters[index].type == value->type)) { + else if ((index < node->paramCount) && (!node->parameters[index].type || !value || node->parameters[index].type == value->type || node->parameters[index].type == VX_TYPE_REFERENCE)) { if (node->paramList[index]) { agoReleaseData(node->paramList[index], false); } @@ -3563,6 +3606,10 @@ VX_API_ENTRY vx_status VX_API_CALL vxReadScalarValue(vx_scalar ref, void *ptr) strcpy((char *)ptr, (const char *)data->buffer); break; default: + if (data->buffer) { + memcpy(ptr, data->buffer, data->size); + break; + } status = VX_ERROR_NOT_SUPPORTED; break; } @@ -3661,6 +3708,10 @@ VX_API_ENTRY vx_status VX_API_CALL vxWriteScalarValue(vx_scalar ref, const void data->isInitialized = vx_true_e; break; default: + if (ptr) { + memcpy(data->buffer,ptr, data->size); + break; + } status = VX_ERROR_NOT_SUPPORTED; break; } diff --git a/openvx/include/vx_ext_amd.h b/openvx/include/vx_ext_amd.h index cbd0ef6..f3a9b91 100644 --- a/openvx/include/vx_ext_amd.h +++ b/openvx/include/vx_ext_amd.h @@ -97,6 +97,8 @@ enum vx_context_attribute_amd_e { VX_CONTEXT_ATTRIBUTE_AMD_SET_MERGE_RULE = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x04, /*! \brief tensor Data max num of dimensions supported by HW. */ VX_CONTEXT_MAX_TENSOR_DIMENSIONS = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x05, + /*! 
\brief CL_QUEUE_PROPERTIES to be used for creating OpenCL command queue. Use a \ref cl_command_queue_properties parameter. */ + VX_CONTEXT_CL_QUEUE_PROPERTIES = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_CONTEXT) + 0x06, }; /*! \brief The AMD kernel attributes list. @@ -188,6 +190,8 @@ enum vx_directive_amd_e { /*! \brief collect performance profile capture. */ VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_DIRECTIVE) + 0x03, VX_DIRECTIVE_AMD_DISABLE_PROFILE_CAPTURE = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_DIRECTIVE) + 0x04, + /*! \brief disable node level flush for a graph. */ + VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_DIRECTIVE) + 0x05, }; /*! \brief An enumeration of additional memory type imports. @@ -198,6 +202,14 @@ enum vx_memory_type_amd_e { VX_MEMORY_TYPE_OPENCL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_MEMORY_TYPE) + 0x2, }; +/*! \brief The image color space list used by the \ref VX_IMAGE_SPACE attribute of a \ref vx_image. +* \ingroup group_image +*/ +enum vx_color_space_amd_e { + /*! \brief Use to indicate that the BT.2020 coefficients are used for conversions. */ + VX_COLOR_SPACE_BT2020 = VX_ENUM_BASE(VX_ID_AMD, VX_ENUM_COLOR_SPACE) + 0x1, +}; + /*! \brief Based on the VX_DF_IMAGE definition. * \note Use \ref vx_df_image to contain these values. */ @@ -221,9 +233,10 @@ typedef struct _vx_tensor_t * vx_tensor; typedef struct { vx_size components; vx_size planes; - vx_size pixelSizeInBits; + vx_size pixelSizeInBitsNum; vx_color_space_e colorSpace; vx_channel_range_e channelRange; + vx_size pixelSizeInBitsDenom; } AgoImageFormatDescription; /*! \brief AMD data structure to specify target affinity. @@ -325,7 +338,8 @@ typedef vx_status(VX_CALLBACK * amd_kernel_opencl_codegen_callback_f) ( /*! \brief AMD usernode callback for regenerating a node. 
*/ -typedef vx_status(VX_CALLBACK * amd_kernel_node_regen_callback_f) (vx_graph graph, vx_node node, vx_bool& regen_not_needed); +typedef vx_status(VX_CALLBACK * amd_drama_add_node_f)(vx_node node, vx_enum kernel_id, vx_reference * paramList, vx_uint32 paramCount); +typedef vx_status(VX_CALLBACK * amd_kernel_node_regen_callback_f)(vx_node node, amd_drama_add_node_f add_node_f, vx_bool& replace_original); /*! \brief AMD usernode callback for updating the OpenCL global_work[]. The framework will pass * OpenVX objects as parameters to OpenCL kernels in othe order they appear to OpenVX node and @@ -520,6 +534,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxGetModuleHandle(vx_node node, const vx_char */ VX_API_ENTRY vx_status VX_API_CALL vxSetContextImageFormatDescription(vx_context context, vx_df_image format, const AgoImageFormatDescription * desc); +/** +* \brief Get custom image format description. +* \ingroup vx_framework_reference +* \param [in] context The context. +* \param [in] format The image format. +* \param [out] desc The image format description. +* \return A \ref vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_ERROR_INVALID_REFERENCE if reference is not valid. +* \retval VX_ERROR_INVALID_FORMAT if format is already in use. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxGetContextImageFormatDescription(vx_context context, vx_df_image format, AgoImageFormatDescription * desc); + #ifdef __cplusplus } #endif diff --git a/runcl/README.md b/runcl/README.md new file mode 100644 index 0000000..ac4f34f --- /dev/null +++ b/runcl/README.md @@ -0,0 +1,67 @@ +# AMD RunCL +RunCL is a command-line tool to build, execute, and debug OpenCL programs, with a simple, easy-to-use interface. + +## RunCL Usage + + Usage: runcl [platform-options] [-I] [[-D=] ...] 
+ [kernel-arguments] + [/] + + [platform-options] + -v verbose + -gpu use GPU device (default) + -cpu use CPU device + -device |# use specified device + -bo OpenCL build option + + [kernel-options] + -k kernel name + -p use persistence flag + -r[link] execution count + -w waiting time + -dumpcl dump OpenCL code after pre-processing + -dumpilisa dump ISA of kernel and show ISA statistics + -dumpelf dump ELF binary + + The shall be given in the order as required by the kernel. + For value arguments use + iv#[,...] or + iv: (e.g., iv#10.2,10,0x10) + For local memory use + lm# (e.g., lm#8192) + For input buffer use + if[#]:[][#[[][/[@#]]]] + (e.g., if:input.bin) + For output (or RW) buffer + of[#]:[#][@][#[[][/[+][@#]]]] + (e.g., of#16384:output.bin) + For input image use + ii#x,,: + (e.g., ii#1920x1080,7680,bgra:screen1920x1080.rgb) + For output image use + oi#x,,: + (e.g., oi#1920x1080,7680,bgra:screen1920x1080.rgb + +## Example + + % cat subtract.cl + __kernel __attribute__((reqd_work_group_size(64, 1, 1))) + void subtract( + __global float * a, + __global float * b, + __global float * c, + uint count) + { + uint id = get_global_id(0); + if(id < count) { + c[id] = a[id] - b[id]; + } + } + % runcl subtract.cl if#4000:a.f32 if#4000:b.f32 of#4000:#out.f32 iv#1000 1024,1,1/64,1,1 + OK: Using GPU device#0 [...] + OK: COMPILATION on GPU took 0.1268 sec for subtract + OK: kernel subtract info reqd_work_group_size(64,1,1) + OK: kernel subtract info work_group_size(256) + OK: kernel subtract info local_mem_size(0) + OK: kernel subtract info local_private_size(0) + OK: RUN SUCCESSFUL on GPU work:{1024,1,1}/{64,1,1} [ 0.00025 sec/exec] subtract (1st execution) diff --git a/runcl/runcl.cpp b/runcl/runcl.cpp index 99849aa..2c7642f 100644 --- a/runcl/runcl.cpp +++ b/runcl/runcl.cpp @@ -70,6 +70,9 @@ float coclock2sec(coclock_t tstart, coclock_t tend) } #endif +//! \brief The macro for fread error checking and reporting. 
+#define ERROR_CHECK_FREAD_(call,value) {size_t retVal = (call); if(retVal != (size_t)value) { fprintf(stderr,"ERROR: fread call expected to return [ %d elements ] but returned [ %d elements ] at " __FILE__ "#%d\n", (int)value, (int)retVal, __LINE__); return -1; } } + void show_usage(const char * program) { printf("Usage: %s [platform-options] [-I] [[-D=] ...] \n", program); @@ -558,7 +561,7 @@ main(int argc, char * argv[]) { FILE * fp = fopen(sourceelffile, "rb"); if (!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", sourceelffile); return !!- 1; } char * elfsrc = source + 2 * sizeof(size_t); - size_t elfsize = fread(elfsrc, 1, sourcesize, fp); + size_t elfsize = fread(elfsrc, 1, sizeof(elfsrc), fp); fclose(fp); ((size_t *)source)[0] = 1; ((size_t *)source)[1] = elfsize; @@ -739,7 +742,7 @@ main(int argc, char * argv[]) fseek(fp, 0L, SEEK_END); size[narg] = ftell(fp); fseek(fp, 0L, SEEK_SET); if (size[narg] < 1) { fprintf(stderr, "ERROR: invalid size value passed/derived for argument #%d\n", narg); return !!- 1; } arg[narg] = calloc(size[narg], 1); if (!arg[narg]) { fprintf(stderr, "ERROR: calloc(%d) failed\n", size[narg]); return !!- 1; } - fread(arg[narg], size[narg], 1, fp); fclose(fp); + ERROR_CHECK_FREAD_(fread(arg[narg], size[narg], 1, fp),1); fclose(fp); } else if (strncmp(argv[0], "lm#", 3) == 0) { @@ -851,7 +854,7 @@ main(int argc, char * argv[]) fp = fopen(p, "rb"); if (!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", p); return !!- 1; } if (size[narg] < 1) { fseek(fp, 0L, SEEK_END); size[narg] = ftell(fp); fseek(fp, 0L, SEEK_SET); } if (skip[narg] > 0) fseek(fp, (long)(size[narg] * skip[narg]), SEEK_SET); - fread(arg[narg], size[narg], 1, fp); + ERROR_CHECK_FREAD_(fread(arg[narg], size[narg], 1, fp),1); fclose(fp); } } @@ -941,7 +944,7 @@ main(int argc, char * argv[]) if (size[narg] < 1) { fprintf(stderr, "ERROR: invalid size value passed/derived for argument #%d\n", narg); return !!- 1; } arg[narg] = calloc(size[narg], 1); if (!arg[narg]) { 
fprintf(stderr, "ERROR: calloc(%d) failed\n", size[narg]); return !!- 1; } if (skip[narg] > 0) fseek(fp, (long)(size[narg] * skip[narg]), SEEK_SET); - if (!noload) fread(arg[narg], size[narg], 1, fp); + if (!noload) { ERROR_CHECK_FREAD_(fread(arg[narg], size[narg], 1, fp), 1); } fclose(fp); } if (!arg[narg]) diff --git a/runvx/CMakeLists.txt b/runvx/CMakeLists.txt index 1eb8c03..309d084 100644 --- a/runvx/CMakeLists.txt +++ b/runvx/CMakeLists.txt @@ -75,9 +75,13 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - find_package(OpenSSL QUIET) - if(OPENSSL_FOUND) + find_path(SSL_INCLUDE + NAMES openssl/hmac.h openssl/md5.h + HINTS /usr/local/opt/openssl/include /usr/include /usr/local/include /opt/local/include + ) + if(NOT "${SSL_INCLUDE}" STREQUAL "SSL_INCLUDE-NOTFOUND") + include_directories(${SSL_INCLUDE}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OpenSSL=1") target_link_libraries(runvx crypto) - endif(OPENSSL_FOUND) + endif() endif() diff --git a/runvx/README.md b/runvx/README.md index 3396be8..5b1003e 100644 --- a/runvx/README.md +++ b/runvx/README.md @@ -1,7 +1,7 @@ # AMD RunVX RunVX is a command-line tool to execute OpenVX graphs, with a simple, easy-to-use interface. It encapsulates most of the routine OpenVX calls, thus speeding up development and enabling rapid prototyping. As input, RunVX takes a GDF (Graph Description Format) file, a simple and intuitive syntax to describe the various data, nodes, and their dependencies. The tool has other useful features, such as, file read/write, data compares, image and keypoint data visualization, etc. -This project uses OpenCV for camera capture and image display. +If available, this project uses OpenCV for camera capture and image display. 
## RunVX Usage and GDF Syntax runvx.exe [options] [argument(s)] diff --git a/runvx/runvx.cpp b/runvx/runvx.cpp index 64c3944..3e603f1 100644 --- a/runvx/runvx.cpp +++ b/runvx/runvx.cpp @@ -26,7 +26,7 @@ THE SOFTWARE. #include "vxEngine.h" // program and version -#define RUNVX_VERSION "0.9.6" +#define RUNVX_VERSION "0.9.7" #if _WIN32 #define RUNVX_PROGRAM "runvx.exe" #else @@ -59,6 +59,8 @@ void show_usage(const char * program, bool detail) printf(" Set context affinity to CPU or GPU.\n"); printf(" -dump-profile\n"); printf(" Print performance profiling information after graph launch.\n"); + printf(" -enable-profile\n"); + printf(" use directive VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE when graph is created\n"); printf(" -discard-compare-errors\n"); printf(" Continue graph processing even if compare mismatches occur.\n"); printf(" -disable-virtual\n"); @@ -90,6 +92,7 @@ int main(int argc, char * argv[]) int arg, frameStart = 0, frameEnd = 1; bool frameCountSpecified = false; int waitKeyDelayInMilliSeconds = -1; // -ve indicates no user preference + bool enableFullProfile = false, disableNodeFlushForCL = false; for (arg = 1; arg < argc; arg++){ if (argv[arg][0] == '-'){ if (!_stricmp(argv[arg], "-h")) { @@ -142,6 +145,12 @@ int main(int argc, char * argv[]) else if (!_stricmp(argv[arg], "-dump-profile")) { enableDumpProfile = true; } + else if (!_stricmp(argv[arg], "-enable-profile")) { + enableFullProfile = true; + } + else if (!_stricmp(argv[arg], "-disable-opencl-node-flush")) { + disableNodeFlushForCL = true; + } else if (!_stricmp(argv[arg], "-dump-gdf") || !_stricmp(argv[arg], "-ago-dump")) { // TBD: remove -ago-dump enableDumpGDF = true; } @@ -186,7 +195,7 @@ int main(int argc, char * argv[]) int errorCode = 0; try { // initialize engine - if (engine.Initialize(argCount, defaultTargetAffinity, defaultTargetInfo, enableScheduleGraph, disableVirtual) < 0) throw - 1; + if (engine.Initialize(argCount, defaultTargetAffinity, defaultTargetInfo, enableScheduleGraph, 
disableVirtual, enableFullProfile, disableNodeFlushForCL) < 0) throw - 1; if (doSetGraphOptimizerFlags) { engine.SetGraphOptimizerFlags(graphOptimizerFlags); } diff --git a/runvx/vxEngine.cpp b/runvx/vxEngine.cpp index 68f1a4e..2076c31 100644 --- a/runvx/vxEngine.cpp +++ b/runvx/vxEngine.cpp @@ -84,7 +84,7 @@ CVxEngine::~CVxEngine() Shutdown(); } -int CVxEngine::Initialize(int argCount, int defaultTargetAffinity, int defaultTargetInfo, bool enableScheduleGraph, bool disableVirtual) +int CVxEngine::Initialize(int argCount, int defaultTargetAffinity, int defaultTargetInfo, bool enableScheduleGraph, bool disableVirtual, bool enableFullProfile, bool disableNodeFlushForCL) { // save configuration m_paramCount = argCount; @@ -122,6 +122,14 @@ int CVxEngine::Initialize(int argCount, int defaultTargetAffinity, int defaultTa m_graph = vxCreateGraph(m_context); if (vxGetStatus((vx_reference)m_graph)){ printf("ERROR: vxCreateGraph failed\n"); throw - 1; } + // select graph options + if(enableFullProfile) { + vxDirective((vx_reference)m_graph, VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE); + } + if(disableNodeFlushForCL) { + vxDirective((vx_reference)m_graph, VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH); + } + return 0; } diff --git a/runvx/vxEngine.h b/runvx/vxEngine.h index 7273a98..54d54bd 100644 --- a/runvx/vxEngine.h +++ b/runvx/vxEngine.h @@ -30,7 +30,7 @@ class CVxEngine { public: CVxEngine(); virtual ~CVxEngine(); - int Initialize(int paramCount, int defaultTargetAffinity, int defaultTargetInfo, bool enableScheduleGraph, bool disableVirtual); + int Initialize(int paramCount, int defaultTargetAffinity, int defaultTargetInfo, bool enableScheduleGraph, bool disableVirtual, bool enableFullProfile, bool disableNodeFlushForCL); void SetConfigOptions(bool verbose, bool discardCompareErrors, bool enableDumpProfile, bool enableDumpGDF, int waitKeyDelayInMilliSeconds); void SetFrameCountOptions(bool enableMultiFrameProcessing, bool framesEofRequested, bool frameCountSpecified, int 
frameStart, int frameEnd); int SetGraphOptimizerFlags(vx_uint32 graph_optimizer_flags); @@ -91,4 +91,4 @@ class CVxEngine { void PrintHelpGDF(const char * command = nullptr); -#endif /* CVX_ENGINE_H*/ \ No newline at end of file +#endif /* CVX_ENGINE_H*/ diff --git a/runvx/vxImage.cpp b/runvx/vxImage.cpp index 4cd7bf6..51166f1 100644 --- a/runvx/vxImage.cpp +++ b/runvx/vxImage.cpp @@ -624,6 +624,9 @@ int CVxParamImage::Finalize() m_compareCountMatches = 0; m_compareCountMismatches = 0; + // Calculate image width for single plane image: + vx_size width_in_bytes = (m_planes == 1) ? CalculateImageWidthInBytes(m_image) : 0; + // compute frame size in bytes m_frameSize = 0; for (vx_uint32 plane = 0; plane < (vx_uint32)m_planes; plane++) { @@ -633,7 +636,8 @@ int CVxParamImage::Finalize() if (vxAccessImagePatch(m_image, &m_rectFull, plane, &addr, (void **)&dst, VX_READ_ONLY) == VX_SUCCESS) { vx_size width = (addr.dim_x * addr.scale_x) / VX_SCALE_UNITY; vx_size height = (addr.dim_y * addr.scale_y) / VX_SCALE_UNITY; - vx_size width_in_bytes = (m_format == VX_DF_IMAGE_U1_AMD) ? ((width + 7) >> 3) : (width * addr.stride_x); + if (addr.stride_x != 0) + width_in_bytes = (width * addr.stride_x); m_frameSize += width_in_bytes * height; ERROR_CHECK(vxCommitImagePatch(m_image, &m_rectFull, plane, &addr, (void *)dst)); } diff --git a/runvx/vxParameter.cpp b/runvx/vxParameter.cpp index 2f1a484..a87d412 100644 --- a/runvx/vxParameter.cpp +++ b/runvx/vxParameter.cpp @@ -557,6 +557,14 @@ const char * ScanParameters(const char * s_, const char * syntax, const char * f *p = 0; if(*s == '"') s++; } + else if (s[0] == '{') { + *p++ = *s++; + // copy till end of the string. 
+ for (; (*s != '\0') && (*s != '}') && (--maxStringBufferLength > 2);) + *p++ = *s++; + if (*s == '}') *p++ = *s++; + *p = 0; + } else { if (!_strnicmp(s, "https://", 8) || !_strnicmp(s, "http://", 7) || !_strnicmp(s, "file://", 7) || (((s[0] >= 'a' && s[0] <= 'z') || (s[0] >= 'A' && s[0] <= 'Z')) && s[1] == ':' && s[2] == '\\')) diff --git a/runvx/vxScalar.cpp b/runvx/vxScalar.cpp index 6c9acc0..3218573 100644 --- a/runvx/vxScalar.cpp +++ b/runvx/vxScalar.cpp @@ -67,6 +67,27 @@ int CVxParamScalar::Initialize(vx_context context, vx_graph graph, const char * if (m_format == VX_TYPE_STRING_AMD) { m_scalar = vxCreateScalar(context, m_format, value); } + else if (m_format == (VX_TYPE_NN_CONV_PARAMS) || m_format == (VX_TYPE_NN_DECONV_PARAMS) || m_format == (VX_TYPE_NN_ROIPOOL_PARAMS)) { + if (m_format == VX_TYPE_NN_CONV_PARAMS) { + vx_nn_convolution_params_t v; + if (!GetScalarValueForStructTypes(m_format, value, &v)) { + m_scalar= vxCreateScalar(context, m_format, &v); + } + } + else if (m_format == VX_TYPE_NN_DECONV_PARAMS) { + vx_nn_deconvolution_params_t v; + if (!GetScalarValueForStructTypes(m_format, value, &v)) { + m_scalar = vxCreateScalar(context, m_format, &v); + } + } + else if (m_format == VX_TYPE_NN_ROIPOOL_PARAMS) { + vx_nn_roi_pool_params_t v; + if (!GetScalarValueForStructTypes(m_format, value, &v)) { + m_scalar = vxCreateScalar(context, m_format, &v); + } + } + else ReportError("ERROR: unsupported scalar value: %s [%s:0x%08x]\n", value, format, m_format); + } else { vx_uint64 v = 0; if (!GetScalarValueFromString(m_format, value, &v)) { diff --git a/runvx/vxTensor.cpp b/runvx/vxTensor.cpp index 2f14ef8..9f971cd 100644 --- a/runvx/vxTensor.cpp +++ b/runvx/vxTensor.cpp @@ -122,7 +122,12 @@ int CVxParamTensor::InitializeIO(vx_context context, vx_graph graph, vx_referenc ERROR_CHECK(vxQueryTensor(m_tensor, VX_TENSOR_DIMS, &m_dims, sizeof(m_dims[0])*m_num_of_dims)); ERROR_CHECK(vxQueryTensor(m_tensor, VX_TENSOR_DATA_TYPE, &m_data_type, sizeof(m_data_type))); 
ERROR_CHECK(vxQueryTensor(m_tensor, VX_TENSOR_FIXED_POINT_POSITION, &m_fixed_point_pos, sizeof(vx_uint8))); - m_size = m_data_type == VX_TYPE_FLOAT32 ? 4 : 2; + if(m_data_type == VX_TYPE_UINT8 || m_data_type == VX_TYPE_INT8) + m_size = 1; + else if(m_data_type == VX_TYPE_UINT16 || m_data_type == VX_TYPE_INT16 || m_data_type == VX_TYPE_FLOAT16) + m_size = 2; + else + m_size = 4; for (vx_uint32 i = 0; i < m_num_of_dims; i++) { m_stride[i] = m_size; m_size *= m_dims[i]; @@ -149,6 +154,19 @@ int CVxParamTensor::InitializeIO(vx_context context, vx_graph graph, vx_referenc else ReportError("ERROR: invalid tensor read option: %s\n", option); } } + else if (!_stricmp(ioType, "init")) + { // init request syntax: init, + FILE * fp = fopen(RootDirUpdated(fileName), "rb"); + if (!fp) { + ReportError("ERROR: Unable to open: %s\n", fileName); + } + if (fread(m_data, 1, m_size, fp) != m_size) + ReportError("ERROR: not enough data (%d bytes) in %s\n", (vx_uint32)m_size, fileName); + vx_status status = vxCopyTensorPatch(m_tensor, m_num_of_dims, nullptr, nullptr, m_stride, m_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + fclose(fp); + if (status != VX_SUCCESS) + ReportError("ERROR: vxCopyTensorPatch: write failed (%d)\n", status); + } else if (!_stricmp(ioType, "write")) { // write request syntax: write,[,ascii|binary] m_fileNameWrite.assign(RootDirUpdated(fileName)); @@ -306,8 +324,9 @@ int CVxParamTensor::CompareFrame(int frameNumber) } vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3]; float avgError = (float)sumError / (float)count; - mismatchDetected = ((float)maxError > m_maxErrorLimit) ? true : false; - mismatchDetected = ((float)avgError > m_avgErrorLimit) ? 
true : mismatchDetected; + mismatchDetected = true; + if (((float)maxError <= m_maxErrorLimit) && ((float)avgError <= m_avgErrorLimit)) + mismatchDetected = false; if (mismatchDetected) printf("ERROR: tensor COMPARE MISMATCHED [max-err: %d] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName); else if (m_verbose) @@ -336,8 +355,9 @@ int CVxParamTensor::CompareFrame(int frameNumber) } vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3]; float avgError = (float)sumError / (float)count; - mismatchDetected = (maxError > m_maxErrorLimit) ? true : false; - mismatchDetected = (avgError > m_avgErrorLimit) ? true : mismatchDetected; + mismatchDetected = true; + if ((maxError <= m_maxErrorLimit) && (avgError <= m_avgErrorLimit)) + mismatchDetected = false; if (mismatchDetected) printf("ERROR: tensor COMPARE MISMATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName); else if (m_verbose) @@ -370,8 +390,9 @@ int CVxParamTensor::CompareFrame(int frameNumber) } vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3]; float avgError = (float)sumError / (float)count; - mismatchDetected = (maxError > m_maxErrorLimit) ? true : false; - mismatchDetected = (avgError > m_avgErrorLimit) ? 
true : mismatchDetected; + mismatchDetected = true; + if ((maxError <= m_maxErrorLimit) && (avgError <= m_avgErrorLimit)) + mismatchDetected = false; if (mismatchDetected) printf("ERROR: tensor COMPARE MISMATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName); else if (m_verbose) diff --git a/runvx/vxUtils.cpp b/runvx/vxUtils.cpp index ebcb8b0..362c975 100644 --- a/runvx/vxUtils.cpp +++ b/runvx/vxUtils.cpp @@ -82,7 +82,11 @@ static struct { const char * name; vx_enum value; } s_table_constants[] = { { "BORDER_MODE_CONSTANT|VX_BORDER_MODE_CONSTANT", VX_BORDER_MODE_CONSTANT }, { "VX_DIRECTIVE_DISABLE_LOGGING", VX_DIRECTIVE_DISABLE_LOGGING }, { "VX_DIRECTIVE_ENABLE_LOGGING", VX_DIRECTIVE_ENABLE_LOGGING }, - { "VX_DIRECTIVE_READ_ONLY", VX_DIRECTIVE_AMD_READ_ONLY }, + { "VX_DIRECTIVE_ENABLE_PERFORMANCE", VX_DIRECTIVE_ENABLE_PERFORMANCE }, + { "VX_DIRECTIVE_READ_ONLY|VX_DIRECTIVE_AMD_READ_ONLY", VX_DIRECTIVE_AMD_READ_ONLY }, + { "VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE", VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE }, + { "VX_DIRECTIVE_AMD_DISABLE_PROFILE_CAPTURE", VX_DIRECTIVE_AMD_DISABLE_PROFILE_CAPTURE }, + { "VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH", VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH }, { "VX_MEMORY_TYPE_NONE", VX_MEMORY_TYPE_NONE }, { "VX_MEMORY_TYPE_HOST", VX_MEMORY_TYPE_HOST }, { "VX_MEMORY_TYPE_OPENCL", VX_MEMORY_TYPE_OPENCL }, @@ -91,6 +95,24 @@ static struct { const char * name; vx_enum value; } s_table_constants[] = { { "BT601_525|VX_COLOR_SPACE_BT601_525", VX_COLOR_SPACE_BT601_525 }, { "BT601_625|VX_COLOR_SPACE_BT601_625", VX_COLOR_SPACE_BT601_625 }, { "BT709|VX_COLOR_SPACE_BT709", VX_COLOR_SPACE_BT709 }, + { "VX_NN_POOLING_MAX", VX_NN_POOLING_MAX }, + { "VX_NN_POOLING_AVG", VX_NN_POOLING_AVG }, + { "VX_NN_DS_SIZE_ROUNDING_FLOOR", VX_NN_DS_SIZE_ROUNDING_FLOOR }, + { "VX_NN_DS_SIZE_ROUNDING_CEILING", VX_NN_DS_SIZE_ROUNDING_CEILING }, + { "VX_NN_ACTIVATION_LOGISTIC", 
VX_NN_ACTIVATION_LOGISTIC }, + { "VX_NN_ACTIVATION_HYPERBOLIC_TAN", VX_NN_ACTIVATION_HYPERBOLIC_TAN }, + { "VX_NN_ACTIVATION_RELU", VX_NN_ACTIVATION_RELU }, + { "VX_NN_ACTIVATION_BRELU", VX_NN_ACTIVATION_BRELU }, + { "VX_NN_ACTIVATION_SOFTRELU", VX_NN_ACTIVATION_SOFTRELU }, + { "VX_NN_ACTIVATION_ABS", VX_NN_ACTIVATION_ABS }, + { "VX_NN_ACTIVATION_SQUARE", VX_NN_ACTIVATION_SQUARE }, + { "VX_NN_ACTIVATION_SQRT", VX_NN_ACTIVATION_SQRT }, + { "VX_NN_ACTIVATION_LINEAR", VX_NN_ACTIVATION_LINEAR }, + { "VX_NN_NORMALIZATION_SAME_MAP", VX_NN_NORMALIZATION_SAME_MAP }, + { "VX_NN_NORMALIZATION_ACROSS_MAPS", VX_NN_NORMALIZATION_ACROSS_MAPS }, + { "VX_TYPE_NN_CONV_PARAMS", VX_TYPE_NN_CONV_PARAMS}, + { "VX_TYPE_NN_DECONV_PARAMS", VX_TYPE_NN_DECONV_PARAMS }, + { "VX_TYPE_NN_ROIPOOL_PARAMS", VX_TYPE_NN_ROIPOOL_PARAMS }, // error codes { "VX_FAILURE", VX_FAILURE }, { "VX_ERROR_REFERENCE_NONZERO", VX_ERROR_REFERENCE_NONZERO }, @@ -481,22 +503,36 @@ size_t CompareImage(vx_image image, vx_rectangle_t * rectRegion, vx_uint8 * refI return errorPixelCountTotal; } +// get image width in bytes from image +vx_size CalculateImageWidthInBytes(vx_image image) +{ + AgoImageFormatDescription format_description; + vx_context context = vxGetContext((vx_reference)image); + vx_df_image format = VX_DF_IMAGE_VIRT; + vx_uint32 width; + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format))); + ERROR_CHECK(vxQueryImage(image, VX_IMAGE_WIDTH, &width, sizeof(width))); + ERROR_CHECK(vxGetContextImageFormatDescription(context, format, &format_description)); + + return ((width * format_description.pixelSizeInBitsNum + format_description.pixelSizeInBitsDenom - 1) / format_description.pixelSizeInBitsDenom + 7) >> 3; +} + // read image int ReadImage(vx_image image, vx_rectangle_t * rectFull, FILE * fp) { - // get number of planes, image format, and pixel type - vx_df_image format = VX_DF_IMAGE_VIRT; + // get number of planes, image width in bytes for single plane vx_size num_planes 
= 0; - ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format))); ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_PLANES, &num_planes, sizeof(num_planes))); + vx_size width_in_bytes = (num_planes == 1) ? CalculateImageWidthInBytes(image) : 0; // read all image planes into vx_image and check if EOF has occured while reading bool eofDetected = false; - for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++){ + for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++) { vx_imagepatch_addressing_t addr; vx_uint8 * src = NULL; ERROR_CHECK(vxAccessImagePatch(image, rectFull, plane, &addr, (void **)&src, VX_WRITE_ONLY)); vx_size width = (addr.dim_x * addr.scale_x) / VX_SCALE_UNITY; - vx_size width_in_bytes = (format == VX_DF_IMAGE_U1_AMD) ? ((width + 7) >> 3) : (width * addr.stride_x); + if (addr.stride_x != 0) + width_in_bytes = (width * addr.stride_x); for (vx_uint32 y = 0; y < addr.dim_y; y += addr.step_y){ vx_uint8 *srcp = (vx_uint8 *)vxFormatImagePatchAddress2d(src, 0, y, &addr); if (fread(srcp, 1, width_in_bytes, fp) != width_in_bytes) { @@ -513,19 +549,19 @@ int ReadImage(vx_image image, vx_rectangle_t * rectFull, FILE * fp) // write image int WriteImage(vx_image image, vx_rectangle_t * rectFull, FILE * fp) { - // get number of planes, image format, and pixel type - vx_df_image format = VX_DF_IMAGE_VIRT; + // get number of planes, image width in bytes for single plane vx_size num_planes = 0; - ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format))); ERROR_CHECK(vxQueryImage(image, VX_IMAGE_ATTRIBUTE_PLANES, &num_planes, sizeof(num_planes))); + vx_size width_in_bytes = (num_planes == 1) ? 
CalculateImageWidthInBytes(image) : 0; // write all image planes from vx_image bool eofDetected = false; - for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++){ + for (vx_uint32 plane = 0; plane < (vx_uint32)num_planes; plane++) { vx_imagepatch_addressing_t addr; vx_uint8 * src = NULL; ERROR_CHECK(vxAccessImagePatch(image, rectFull, plane, &addr, (void **)&src, VX_READ_ONLY)); vx_size width = (addr.dim_x * addr.scale_x) / VX_SCALE_UNITY; - vx_size width_in_bytes = (format == VX_DF_IMAGE_U1_AMD) ? ((width + 7) >> 3) : (width * addr.stride_x); + if (addr.stride_x != 0) + width_in_bytes = (width * addr.stride_x); for (vx_uint32 y = 0; y < addr.dim_y; y += addr.step_y){ vx_uint8 *srcp = (vx_uint8 *)vxFormatImagePatchAddress2d(src, 0, y, &addr); fwrite(srcp, 1, width_in_bytes, fp); @@ -601,6 +637,60 @@ int ReadScalarToString(vx_scalar scalar, char str[]) return 0; } +// get scalar value from struct types. +int GetScalarValueForStructTypes(vx_enum type, const char str[], void * value) +{ + auto getNextToken = [](const char *& s, char * token, size_t size) -> const char * { + size_t i = 0; + for (size--; *s && *s != ',' && *s != '}'; s++) { + if(i < size) + token[i++] = *s; + } + if(*s == ',' || *s == '}') + s++; + token[i] = '\0'; + return token; + }; + + char token[1024]; + const char * s = &str[1]; + if(str[0] != '{') { + printf("ERROR: GetScalarValueForStructTypes: string must start with '{'\n"); + return -1; + } + else if (type == VX_TYPE_NN_CONV_PARAMS) { + vx_nn_convolution_params_t v; + v.padding_x = atoi(getNextToken(s, token, sizeof(token))); + v.padding_y = atoi(getNextToken(s, token, sizeof(token))); + v.overflow_policy = ovxName2Enum(getNextToken(s, token, sizeof(token))); + v.rounding_policy = ovxName2Enum(getNextToken(s, token, sizeof(token))); + v.down_scale_size_rounding = ovxName2Enum(getNextToken(s, token, sizeof(token))); + v.dilation_x = atoi(getNextToken(s, token, sizeof(token))); + v.dilation_y = atoi(getNextToken(s, token, 
sizeof(token))); + *(vx_nn_convolution_params_t *)value = v; + } + else if (type == VX_TYPE_NN_DECONV_PARAMS) { + vx_nn_deconvolution_params_t v; + v.padding_x = atoi(getNextToken(s, token, sizeof(token))); + v.padding_y = atoi(getNextToken(s, token, sizeof(token))); + v.overflow_policy = ovxName2Enum(getNextToken(s, token, sizeof(token))); + v.rounding_policy = ovxName2Enum(getNextToken(s, token, sizeof(token))); + v.a_x = atoi(getNextToken(s, token, sizeof(token))); + v.a_y = atoi(getNextToken(s, token, sizeof(token))); + *(vx_nn_deconvolution_params_t *)value = v; + } + else if (type == VX_TYPE_NN_ROIPOOL_PARAMS) { + vx_nn_roi_pool_params_t v; + v.pool_type = ovxName2Enum(getNextToken(s, token, sizeof(token))); + *(vx_nn_roi_pool_params_t *)value = v; + } + else { + printf("ERROR: GetScalarValueForStructTypes: unsupported type 0x%08x\n", type); + return -1; + } + return 0; +} + // get scalar value from string int GetScalarValueFromString(vx_enum type, const char str[], vx_uint64 * value) { diff --git a/runvx/vxUtils.h b/runvx/vxUtils.h index d51fdf3..1c24cd2 100644 --- a/runvx/vxUtils.h +++ b/runvx/vxUtils.h @@ -39,6 +39,7 @@ THE SOFTWARE. 
#include #include #include +#include #include #include @@ -171,6 +172,8 @@ vx_enum ovxName2Enum(const char * name); void ComputeChecksum(char checkSumString[64], vx_image image, vx_rectangle_t * rectRegion); // compare rectangular region specified within an image and return number of pixels mismatching size_t CompareImage(vx_image image, vx_rectangle_t * rectRegion, vx_uint8 * refImage, float errLimitMin, float errLimitMax, int frameNumber, const char * fileNameRef); +// get image width in bytes from image +vx_size CalculateImageWidthInBytes(vx_image image); // read image int ReadImage(vx_image image, vx_rectangle_t * rectFull, FILE * fp); // write image @@ -181,6 +184,7 @@ int ReadScalarToString(vx_scalar scalar, char str[]); int WriteScalarFromString(vx_scalar scalar, const char str[]); int GetScalarValueFromString(vx_enum type, const char str[], vx_uint64 * value); int PutScalarValueToString(vx_enum type, const void * value, char str[]); +int GetScalarValueForStructTypes(vx_enum type, const char str[], void * value); // useful utility functions: // stristr -- case insensitive version of strstr