Commit 926a6bba authored by Vladislav Vinogradov

modified according to CUDA 4.0 API updates

parent 98d663e7
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
find_package(NPP 3.2.16 REQUIRED)
message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test files are processed in a separate directory to avoid creating a
# 'Src' source filter in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
#get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
#find_package(NPP 3.2.16 REQUIRED)
#message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering convenions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test files processing is in the separated directory to avoid 'Src' source
# filter creation in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()
###############################################################################
#
# FindNPP.cmake
#
# CUDA_NPP_LIBRARY_ROOT_DIR -- Path to the NPP directory.
# CUDA_NPP_INCLUDES -- NPP Include directories.
# CUDA_NPP_LIBRARIES -- NPP libraries.
# NPP_VERSION -- NPP version in format "major.minor.build".
#
# If not found automatically, please set CUDA_NPP_LIBRARY_ROOT_DIR
# in CMake or set the environment variable $CUDA_NPP_ROOT
#
# Author: Anatoly Baksheev, Itseez Ltd.
#
# The MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
cmake_policy(PUSH)
cmake_minimum_required(VERSION 2.8.0)
cmake_policy(POP)
if(NOT "${CUDA_NPP_LIBRARY_ROOT_DIR}" STREQUAL "${CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL}")
unset(CUDA_NPP_INCLUDES CACHE)
unset(CUDA_NPP_LIBRARIES CACHE)
endif()
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "32")
else()
set(NPP_SUFFIX "-mt")
endif()
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "64")
else()
set(NPP_SUFFIX "-mt-x64")
endif()
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
if(NOT CUDA_NPP_LIBRARY_ROOT_DIR OR CUDA_NPP_LIBRARY_ROOT_DIR STREQUAL "")
unset(CUDA_NPP_LIBRARY_ROOT_DIR CACHE)
find_path(CUDA_NPP_LIBRARY_ROOT_DIR "common/npp/include/npp.h" PATHS ENV CUDA_NPP_ROOT DOC "NPP root directory.")
MESSAGE(STATUS "NPP root directory: " ${CUDA_NPP_LIBRARY_ROOT_DIR})
endif()
# Search includes in our own paths.
find_path(CUDA_NPP_INCLUDES npp.h PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}/common/npp/include")
# Search default search paths, after we search our own set of paths.
find_path(CUDA_NPP_INCLUDES device_functions.h)
mark_as_advanced(CUDA_NPP_INCLUDES)
# Find NPP library
find_library(CUDA_NPP_LIBRARIES
NAMES "npp" "npp${NPP_SUFFIX}" "libnpp${NPP_SUFFIX}"
PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}"
PATH_SUFFIXES "common/lib" "common/npp/lib"
DOC "NPP library"
)
# Search default search paths, after we search our own set of paths.
find_library(CUDA_NPP_LIBRARIES NAMES npp${NPP_SUFFIX} libnpp${NPP_SUFFIX} DOC "NPP library")
mark_as_advanced(CUDA_NPP_LIBRARIES)
if(EXISTS ${CUDA_NPP_INCLUDES}/nppversion.h)
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_major REGEX "#define NPP_VERSION_MAJOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_minor REGEX "#define NPP_VERSION_MINOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_build REGEX "#define NPP_VERSION_BUILD.*")
string( REGEX REPLACE "#define NPP_VERSION_MAJOR[ \t]+|//.*" "" npp_major ${npp_major})
string( REGEX REPLACE "#define NPP_VERSION_MINOR[ \t]+|//.*" "" npp_minor ${npp_minor})
string( REGEX REPLACE "#define NPP_VERSION_BUILD[ \t]+|//.*" "" npp_build ${npp_build})
string( REGEX MATCH "[0-9]+" npp_major ${npp_major} )
string( REGEX MATCH "[0-9]+" npp_minor ${npp_minor} )
string( REGEX MATCH "[0-9]+" npp_build ${npp_build} )
set( NPP_VERSION "${npp_major}.${npp_minor}.${npp_build}")
endif()
if(NOT EXISTS ${CUDA_NPP_LIBRARIES} OR NOT EXISTS ${CUDA_NPP_INCLUDES}/npp.h)
set(CUDA_NPP_FOUND FALSE)
message(FATAL_ERROR "NPP headers/libraries are not found. Please specify CUDA_NPP_LIBRARY_ROOT_DIR in CMake or set $CUDA_NPP_ROOT.")
endif()
include( FindPackageHandleStandardArgs )
find_package_handle_standard_args( NPP
REQUIRED_VARS
CUDA_NPP_INCLUDES
CUDA_NPP_LIBRARIES
#Need cmake 2.8.3 to uncomment this.
#VERSION_VAR
NPP_VERSION)
if(APPLE)
# We need to add the path to cudart to the linker using rpath, since the library name for the cuda libraries is prepended with @rpath.
get_filename_component(_cuda_path_to_npp "${CUDA_NPP_LIBRARIES}" PATH)
if(_cuda_path_to_npp)
list(APPEND CUDA_NPP_LIBRARIES "-Wl,-rpath,${_cuda_path_to_npp}")
endif()
endif()
set(CUDA_NPP_FOUND TRUE)
set(CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL "${CUDA_NPP_LIBRARY_ROOT_DIR}" CACHE INTERNAL "This is the value of the last time CUDA_NPP_LIBRARY_ROOT_DIR was set successfully." FORCE)
@@ -51,7 +51,6 @@ using namespace std;
 cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); }
 cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); }
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

@@ -101,7 +100,7 @@ namespace
     template <typename T>
     void bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                    GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t stream)
+                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
     {
         short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
         short max_disc = short(ndisp * max_disc_threshold + 0.5);

@@ -109,14 +108,19 @@ namespace
         bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
         if (&dst != &disp)
-            disp.copyTo(dst);
+        {
+            if (stream)
+                stream.enqueueCopy(disp, dst);
+            else
+                disp.copyTo(dst);
+        }
-        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, stream);
+        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
     }

     typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                                 GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t stream);
+                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);

     const bilateral_filter_operator_t operators[] =
         {bilateral_filter_operator<unsigned char>, 0, 0, bilateral_filter_operator<short>, 0, 0, 0, 0};

@@ -139,18 +143,11 @@ cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radi
     calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
 }

-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, 0);
-}
-
 void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
 {
     CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
     CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, StreamAccessor::getStream(stream));
+    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
 }

 #endif /* !defined (HAVE_CUDA) */
@@ -48,8 +48,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)

-void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&,
-                          GpuMat&) { throw_nogpu(); }
+void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 #else

@@ -57,14 +56,14 @@ namespace cv { namespace gpu
 {
     template <typename T>
     void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStep_<float> weights1, const PtrStep_<float> weights2, PtrStep_<T> result);
+                           const PtrStep_<float> weights1, const PtrStep_<float> weights2, PtrStep_<T> result, cudaStream_t stream);

     void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result);
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream);
 }}

 void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                          GpuMat& result)
+                          GpuMat& result, Stream& stream)
 {
     CV_Assert(img1.size() == img2.size());
     CV_Assert(img1.type() == img2.type());

@@ -83,12 +82,12 @@ void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat&
 {
 case CV_8U:
     if (cn != 4)
-        blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result);
+        blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     else
-        blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result);
+        blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     break;
 case CV_32F:
-    blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result);
+    blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     break;
 default:
     CV_Error(CV_StsUnsupportedFormat, "bad image depth in linear blending function");
@@ -44,20 +44,11 @@
 #if !defined(HAVE_CUDA)

-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&,
-                              GpuMat&) { throw_nogpu(); }
-
-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&,
-                              GpuMat&, const Stream&) { throw_nogpu(); }
-
-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&,
-                            const Mat&, const Mat&, GpuMat&) { throw_nogpu(); }
-
-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&,
-                            const Mat&, const Mat&, GpuMat&, const Stream&) { throw_nogpu(); }
-
-void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&,
-                             Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }
+void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }

 #else

@@ -66,14 +57,12 @@ using namespace cv::gpu;
 namespace cv { namespace gpu { namespace transform_points
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl,
-              DevMem2D_<float3> dst, cudaStream_t stream);
+    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
 }}}

 namespace
 {
-    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                               GpuMat& dst, cudaStream_t stream)
+    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
         CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);

@@ -88,30 +77,20 @@ namespace
     }
 }

-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              GpuMat& dst)
-{
-    ::transformPointsCaller(src, rvec, tvec, dst, 0);
-}
-
-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              GpuMat& dst, const Stream& stream)
+void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
 {
     ::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
 }

 namespace cv { namespace gpu { namespace project_points
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl,
-              const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
+    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
 }}}

 namespace
 {
-    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                             const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                             cudaStream_t stream)
+    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
         CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);

@@ -124,20 +103,11 @@ namespace
         Rodrigues(rvec, rot);
         dst.create(src.size(), CV_32FC2);
-        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(),
-                             camera_mat.ptr<float>(), dst, stream);
+        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst, stream);
     }
 }

-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                            const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst)
-{
-    ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, 0);
-}
-
-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                            const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                            const Stream& stream)
+void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
 {
     ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
 }
@@ -47,8 +47,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)

-void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); }
-void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, const Stream&) { throw_nogpu(); }
+void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

@@ -455,12 +454,7 @@ namespace
     }
 }

-void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn)
-{
-    cvtColor_caller(src, dst, code, dcn, 0);
-}
-
-void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream)
+void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
 {
     cvtColor_caller(src, dst, code, dcn, StreamAccessor::getStream(stream));
 }
@@ -68,19 +68,22 @@ namespace cv { namespace gpu
     template <typename T>
     void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result)
+                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
-        blendLinearKernel<<<grid, threads>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }

     template void blendLinearCaller<uchar>(int, int, int, const PtrStep, const PtrStep,
-                                           const PtrStepf, const PtrStepf, PtrStep);
+                                           const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream);
     template void blendLinearCaller<float>(int, int, int, const PtrStepf, const PtrStepf,
-                                           const PtrStepf, const PtrStepf, PtrStepf);
+                                           const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream);

     __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,

@@ -105,13 +108,16 @@ namespace cv { namespace gpu
     void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result)
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-        blendLinearKernel8UC4<<<grid, threads>>>(rows, cols, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }
 }}
\ No newline at end of file
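The kernel-side counterpart, repeated across the .cu hunks below, is the CUDA 4.0 launch idiom: pass the stream as the fourth launch-configuration argument, check the launch itself with cudaGetLastError(), and block with cudaDeviceSynchronize() (the replacement for the deprecated cudaThreadSynchronize()) only when running on the default stream. A self-contained sketch of the idiom, assuming the module's cudaSafeCall macro; addOneKernel and addOneCaller are hypothetical names, not part of this commit:

    __global__ void addOneKernel(float* data, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] += 1.0f;
    }

    void addOneCaller(float* d_data, int n, cudaStream_t stream)
    {
        dim3 threads(256);
        dim3 grid((n + threads.x - 1) / threads.x);

        // asynchronous launch on the caller's stream
        addOneKernel<<<grid, threads, 0, stream>>>(d_data, n);
        cudaSafeCall( cudaGetLastError() );          // catches launch-configuration errors

        if (stream == 0)                             // default stream keeps blocking semantics
            cudaSafeCall( cudaDeviceSynchronize() );
    }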
@@ -184,7 +184,9 @@ namespace cv { namespace gpu
         computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
                 num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 } // namespace solvepnp_ransac
@@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc
     };

     template <typename T1, typename T2>
-    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
         NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
+        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
     }

-    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<uint, uint>(src1, src2, dst);
+        compare_ne<uint, uint>(src1, src2, dst, stream);
     }
-    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<float, float>(src1, src2, dst);
+        compare_ne<float, float>(src1, src2, dst, stream);
     }

@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                        img_block_width, grad, qangle, scale, block_hists);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -324,7 +324,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
                        block_hists, coefs, free_coef, threshold, labels);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 //----------------------------------------------------------------------------

@@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 //----------------------------------------------------------------------------

@@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
         compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 template <int nthreads, int correct_gamma>

@@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
         compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
     resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall( cudaThreadSynchronize() );
+
+    cudaSafeCall( cudaDeviceSynchronize() );
+
     cudaSafeCall( cudaUnbindTexture(tex) );
 }
@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture(tex_remap) );
     }

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////

@@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }
     extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)

@@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }

@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)

@@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////

@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)

@@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc
         extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////

@@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall(cudaUnbindTexture(harrisDxTex));
         cudaSafeCall(cudaUnbindTexture(harrisDyTex));
     }

@@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+
+        cudaSafeCall(cudaDeviceSynchronize());
         cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
         cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
     }

@@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc
         column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////////////////////////////////////////////

@@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
         downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);
@@ -46,6 +46,8 @@
 #include "opencv2/gpu/devmem2d.hpp"
 #include "safe_call.hpp"
 #include "cuda_runtime.h"
+#include "npp.h"
+#include "NPP_staging.hpp"

 namespace cv
 {

@@ -106,6 +108,41 @@ namespace cv
             cudaSafeCall( cudaGetTextureReference(&tex, name) );
             cudaSafeCall( cudaUnbindTexture(tex) );
         }
+
+        class NppStreamHandler
+        {
+        public:
+            inline explicit NppStreamHandler(cudaStream_t newStream = 0)
+            {
+                oldStream = nppGetStream();
+                nppSetStream(newStream);
+            }
+
+            inline ~NppStreamHandler()
+            {
+                nppSetStream(oldStream);
+            }
+
+        private:
+            cudaStream_t oldStream;
+        };
+
+        class NppStStreamHandler
+        {
+        public:
+            inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
+            {
+                oldStream = nppStSetActiveCUDAstream(newStream);
+            }
+
+            inline ~NppStStreamHandler()
+            {
+                nppStSetActiveCUDAstream(oldStream);
+            }
+
+        private:
+            cudaStream_t oldStream;
+        };
     }
 }
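The two classes added above are RAII guards: the constructor points NPP (or NPP_staging) at the caller's stream and the destructor restores the previous one, which lets NPP-backed functions follow the same Stream& convention as the hand-written kernels. A hypothetical caller, not part of this commit, sketching the intended use:

    void nppBackedOperation(cudaStream_t stream)
    {
        NppStreamHandler h(stream);   // nppSetStream(stream); previous stream saved

        // ... any NPP call made here now executes on `stream` ...

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );  // keep default-stream blocking semantics
    }   // ~NppStreamHandler restores the previous NPP stream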
@@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U(
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U(
w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
...
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)
...
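Note: the "if (stream == 0)" guard that starts appearing here is the commit's convention for stream-aware callers: the kernel is always launched asynchronously on the supplied stream, and the function only blocks when it was invoked on the default stream. A condensed sketch of that convention (kernel and names are placeholders, and the module's cudaSafeCall wrapper is replaced by bare runtime calls):

#include <cuda_runtime.h>

__global__ void workKernel(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] += 1.0f;
}

void callerPattern(float* d_data, int n, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    workKernel<<<grid, threads, 0, stream>>>(d_data, n);  // always an async launch
    cudaGetLastError();                                   // check the launch itself

    if (stream == 0)                 // default stream: keep the old blocking behavior
        cudaDeviceSynchronize();     // otherwise the caller synchronizes later
}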
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);
...
@@ -275,11 +275,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -306,11 +306,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -363,11 +363,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -395,11 +395,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -609,17 +609,17 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
-cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
@@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
...
@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
...
@@ -102,19 +102,19 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
@@ -327,8 +327,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
@@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
@@ -531,10 +531,10 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
}}}
@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
@@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
@@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream);
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void output_gpu<short>(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_<short>& disp, cudaStream_t stream);
...
@@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
@@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
...
@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
@@ -64,6 +64,8 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { thro
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
+Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
+cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#else /* !defined (HAVE_CUDA) */
@@ -117,7 +119,7 @@ namespace
}
}
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
+CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
void cv::gpu::Stream::create()
{
@@ -188,18 +190,35 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+    {
+        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, impl->stream) );
+        return;
+    }
+    if (src.depth() == CV_8U)
+    {
+        int cn = src.channels();
+        if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+        {
+            int val = saturate_cast<uchar>(s[0]);
+            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, impl->stream) );
+            return;
+        }
+    }
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
kernelSet<int>, kernelSet<float>, kernelSet<double>
};
-set_callers[src.depth()](src, val, impl->stream);
+set_callers[src.depth()](src, s, impl->stream);
}
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -246,5 +265,17 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
}
+cv::gpu::Stream::operator bool() const
+{
+    return impl && impl->stream;
+}
+cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {}
+cv::gpu::Stream& cv::gpu::Stream::Null()
+{
+    static Stream s((Impl*)0);
+    return s;
+}
#endif /* !defined (HAVE_CUDA) */
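Note: two additions land here. Stream::Null() exposes a shared default-stream object whose impl is null, which the patched getStream() maps to stream 0, and operator bool() lets callers test for a real stream. Separately, enqueueMemSet now takes a fast path through cudaMemset2DAsync whenever the scalar is all zeros, or the matrix is 8-bit with every channel set to the same value; only the remaining cases fall through to the kernelSet dispatch table. A standalone sketch of just that fast-path test (simplified types, names illustrative, not the module's code):

// Returns true when a fill can be expressed as a plain byte memset: either the
// value is zero in all four channels, or the matrix is 8-bit (depth == 0,
// i.e. CV_8U) and every used channel carries the same value.
bool canUseMemset2D(int depth, int channels, const double s[4], unsigned char& byteVal)
{
    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
    {
        byteVal = 0;
        return true;
    }
    if (depth == 0)  // CV_8U
    {
        bool uniform = true;
        for (int c = 1; c < channels; ++c)
            uniform = uniform && (s[c] == s[0]);
        if (uniform)
        {
            byteVal = (unsigned char)s[0];
            return true;
        }
    }
    return false;
}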
@@ -44,11 +44,11 @@
#if !defined (HAVE_CUDA)
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf)
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
Size src_size = terminals.size();
CV_Assert(terminals.type() == CV_32S);
@@ -73,17 +73,17 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
buf.create(1, bufsz, CV_8U);
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
terminals.step, leftTransp.step, sznpp, labels.ptr<Npp8u>(), labels.step, buf.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+    cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */
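Note: with the Stream& parameter, callers can queue the graph cut on their own stream and defer synchronization. A hypothetical call site under the 2.x gpu API (all variable names illustrative; the GpuMats are assumed to be pre-filled CV_32S inputs sized per the nppiGraphcut requirements):

#include "opencv2/gpu/gpu.hpp"

void runGraphcutAsync(cv::gpu::GpuMat& terminals, cv::gpu::GpuMat& leftTransp,
                      cv::gpu::GpuMat& rightTransp, cv::gpu::GpuMat& top,
                      cv::gpu::GpuMat& bottom, cv::gpu::GpuMat& labels,
                      cv::gpu::GpuMat& buf)
{
    cv::gpu::Stream stream;  // user-created, non-default stream

    // Queued on `stream`; no implicit cudaDeviceSynchronize() happens inside.
    cv::gpu::graphcut(terminals, leftTransp, rightTransp, top, bottom, labels, buf, stream);

    // ... queue further work on the same stream ...

    stream.waitForCompletion();  // explicit sync replaces the implicit one
}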
@@ -128,7 +128,7 @@ void cv::gpu::GpuMat::copyTo( GpuMat& m ) const
CV_DbgAssert(!this->empty());
m.create(size(), type());
cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::GpuMat::copyTo( GpuMat& mat, const GpuMat& mask ) const
@@ -179,7 +179,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), src.step, dst.ptr<dst_t>(), dst.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
@@ -193,7 +193,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), src.step, dst.ptr<dst_t>(), dst.step, sz, NPP_RND_NEAR) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -349,7 +349,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
@@ -364,7 +364,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -400,7 +400,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
@@ -415,7 +415,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -463,8 +463,8 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
-{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
-{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
...
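Note: the setTo dispatch above is a depth-by-channels table of function pointers: an NPP primitive wherever a matching nppiSet_* variant exists (the two-channel 16u/16s entries are newly wired in), and the templated kernelSet fallback elsewhere. The table idea, stripped to its essentials (a standalone sketch with simplified types and made-up names):

#include <cstdio>

typedef void (*set_func_t)(double value);

void nppLikeSet(double)    { std::printf("NPP fast path\n"); }
void kernelLikeSet(double) { std::printf("custom-kernel fallback\n"); }

// Rows indexed by depth, columns by channel count - 1.
static const set_func_t setCallers[2][4] =
{
    { nppLikeSet,    kernelLikeSet, kernelLikeSet, nppLikeSet    },  // depth 0
    { kernelLikeSet, kernelLikeSet, kernelLikeSet, kernelLikeSet },  // depth 1
};

void setTo(int depth, int channels, double value)
{
    setCallers[depth][channels - 1](value);  // O(1) dispatch, no switch/if chains
}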
@@ -114,24 +114,14 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
sz.width = src.cols;
sz.height = src.rows;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf(2);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, dbuf, (double*)dbuf + 1) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
-#else
-nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
}
@@ -184,25 +174,15 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
int funcIdx = normType >> 1;
double retVal;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, dbuf) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
dbuf.download(&retVal);
-#else
-nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, &retVal) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
return retVal;
}
...
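Note: from NPP 4.0 these reduction primitives write their results to device memory, so the old branch that passed host pointers directly is deleted: every call now goes through a small device-side scratch buffer that is read back after synchronization. A simplified stand-in for that helper (a sketch, not the module's actual DeviceBuffer, and it omits error checking):

#include <cuda_runtime.h>

// Owns `count` doubles in device memory for NPP reduction outputs.
struct DeviceBufferSketch
{
    double* ptr;
    int count;

    explicit DeviceBufferSketch(int count_ = 1) : ptr(0), count(count_)
    {
        cudaMalloc(&ptr, count * sizeof(double));
    }
    ~DeviceBufferSketch() { cudaFree(ptr); }

    operator double*() { return ptr; }  // pass straight into the NPP call

    void download(double* dst)          // copy the results back to the host
    {
        cudaMemcpy(dst, ptr, count * sizeof(double), cudaMemcpyDeviceToHost);
    }
};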
@@ -332,7 +332,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -349,7 +349,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
@@ -370,7 +370,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -389,7 +389,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
...
@@ -77,8 +77,8 @@
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
-#define CUDART_MINIMUM_REQUIRED_VERSION 3020
-#define NPP_MINIMUM_REQUIRED_VERSION 3216
+#define CUDART_MINIMUM_REQUIRED_VERSION 4000
+#define NPP_MINIMUM_REQUIRED_VERSION 4000
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
...
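Note: raising both minimums to 4000 turns the CUDA 4.0 requirement into a hard compile-time failure instead of a runtime surprise. Spelled out, the guard pattern is simply (a sketch; CUDART_VERSION is defined by the CUDA runtime headers, e.g. 4000 for CUDA 4.0):

#include <cuda_runtime_api.h>  // defines CUDART_VERSION

#define CUDART_MINIMUM_REQUIRED_VERSION 4000

#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
    #error "Insufficient Cuda Runtime library version, please update it."
#endif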
@@ -46,14 +46,10 @@ using namespace std;
#if !defined (HAVE_CUDA)
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -148,51 +144,25 @@ namespace cv { namespace gpu { namespace split_merge
}}}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst)
-{
-    split_merge::merge(src, n, dst, 0);
-}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst)
-{
-    split_merge::merge(&src[0], src.size(), dst, 0);
-}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst)
-{
-    split_merge::split(src, dst, 0);
-}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst)
-{
-    dst.resize(src.channels());
-    if(src.channels() > 0)
-        split_merge::split(src, &dst[0], 0);
-}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
split_merge::split(src, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
...
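Note: the stream-less merge/split overloads can be dropped because a Stream& parameter defaulting to Stream::Null() serves both synchronous and asynchronous callers, halving the entry points. The idiom in isolation (generic names, not the gpu module's actual declarations):

#include <vector>

struct Stream
{
    static Stream& Null()  // shared "default stream" singleton
    {
        static Stream s;
        return s;
    }
};

// One declaration covers both styles of call:
void merge(const std::vector<int>& src, int& dst, Stream& stream = Stream::Null());

void merge(const std::vector<int>& src, int& dst, Stream& stream)
{
    bool async = (&stream != &Stream::Null());  // dispatch on the null stream
    (void)src; (void)dst; (void)async;
}

// merge(v, d);           -> blocking, default stream
// merge(v, d, myStream); -> queued on myStream, caller synchronizes later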