Commit 926a6bba authored by Vladislav Vinogradov

modified according to CUDA 4.0 API updates

parent 98d663e7
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
find_package(NPP 3.2.16 REQUIRED)
message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test files are processed in a separate directory to avoid creating a
# 'Src' source filter in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
#get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
#find_package(NPP 3.2.16 REQUIRED)
#message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering convenions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test files processing is in the separated directory to avoid 'Src' source
# filter creation in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()
###############################################################################
#
# FindNPP.cmake
#
# CUDA_NPP_LIBRARY_ROOT_DIR -- Path to the NPP directory.
# CUDA_NPP_INCLUDES -- NPP Include directories.
# CUDA_NPP_LIBRARIES -- NPP libraries.
# NPP_VERSION -- NPP version in format "major.minor.build".
#
# If not found automatically, please set CUDA_NPP_LIBRARY_ROOT_DIR
# in CMake or set the environment variable $CUDA_NPP_ROOT
#
# Author: Anatoly Baksheev, Itseez Ltd.
#
# The MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
cmake_policy(PUSH)
cmake_minimum_required(VERSION 2.8.0)
cmake_policy(POP)
if(NOT "${CUDA_NPP_LIBRARY_ROOT_DIR}" STREQUAL "${CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL}")
unset(CUDA_NPP_INCLUDES CACHE)
unset(CUDA_NPP_LIBRARIES CACHE)
endif()
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "32")
else()
set(NPP_SUFFIX "-mt")
endif()
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "64")
else()
set(NPP_SUFFIX "-mt-x64")
endif()
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
if(NOT CUDA_NPP_LIBRARY_ROOT_DIR OR CUDA_NPP_LIBRARY_ROOT_DIR STREQUAL "")
unset(CUDA_NPP_LIBRARY_ROOT_DIR CACHE)
find_path(CUDA_NPP_LIBRARY_ROOT_DIR "common/npp/include/npp.h" PATHS ENV CUDA_NPP_ROOT DOC "NPP root directory.")
MESSAGE(STATUS "NPP root directory: " ${CUDA_NPP_LIBRARY_ROOT_DIR})
endif()
# Search includes in our own paths.
find_path(CUDA_NPP_INCLUDES npp.h PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}/common/npp/include")
# Search default search paths, after we search our own set of paths.
find_path(CUDA_NPP_INCLUDES device_functions.h)
mark_as_advanced(CUDA_NPP_INCLUDES)
# Find NPP library
find_library(CUDA_NPP_LIBRARIES
NAMES "npp" "npp${NPP_SUFFIX}" "libnpp${NPP_SUFFIX}"
PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}"
PATH_SUFFIXES "common/lib" "common/npp/lib"
DOC "NPP library"
)
# Search default search paths, after we search our own set of paths.
find_library(CUDA_NPP_LIBRARIES NAMES npp${NPP_SUFFIX} libnpp${NPP_SUFFIX} DOC "NPP library")
mark_as_advanced(CUDA_NPP_LIBRARIES)
if(EXISTS ${CUDA_NPP_INCLUDES}/nppversion.h)
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_major REGEX "#define NPP_VERSION_MAJOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_minor REGEX "#define NPP_VERSION_MINOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_build REGEX "#define NPP_VERSION_BUILD.*")
string( REGEX REPLACE "#define NPP_VERSION_MAJOR[ \t]+|//.*" "" npp_major ${npp_major})
string( REGEX REPLACE "#define NPP_VERSION_MINOR[ \t]+|//.*" "" npp_minor ${npp_minor})
string( REGEX REPLACE "#define NPP_VERSION_BUILD[ \t]+|//.*" "" npp_build ${npp_build})
string( REGEX MATCH "[0-9]+" npp_major ${npp_major} )
string( REGEX MATCH "[0-9]+" npp_minor ${npp_minor} )
string( REGEX MATCH "[0-9]+" npp_build ${npp_build} )
set( NPP_VERSION "${npp_major}.${npp_minor}.${npp_build}")
endif()
if(NOT EXISTS ${CUDA_NPP_LIBRARIES} OR NOT EXISTS ${CUDA_NPP_INCLUDES}/npp.h)
set(CUDA_NPP_FOUND FALSE)
message(FATAL_ERROR "NPP headers/libraries are not found. Please specify CUDA_NPP_LIBRARY_ROOT_DIR in CMake or set $CUDA_NPP_ROOT.")
endif()
include( FindPackageHandleStandardArgs )
find_package_handle_standard_args( NPP
REQUIRED_VARS
CUDA_NPP_INCLUDES
CUDA_NPP_LIBRARIES
#Need cmake 2.8.3 to uncomment this.
#VERSION_VAR
NPP_VERSION)
if(APPLE)
# We need to add the path to cudart to the linker using rpath, since the library name for the cuda libraries is prepended with @rpath.
get_filename_component(_cuda_path_to_npp "${CUDA_NPP_LIBRARIES}" PATH)
if(_cuda_path_to_npp)
list(APPEND CUDA_NPP_LIBRARIES "-Wl,-rpath,${_cuda_path_to_npp}")
endif()
endif()
set(CUDA_NPP_FOUND TRUE)
set(CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL "${CUDA_NPP_LIBRARY_ROOT_DIR}" CACHE INTERNAL "This is the value of the last time CUDA_NPP_LIBRARY_ROOT_DIR was set successfully." FORCE)
@@ -51,7 +51,6 @@ using namespace std;
 cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); }
 cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); }
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

@@ -101,7 +100,7 @@ namespace
     template <typename T>
     void bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                    GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t stream)
+                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
     {
         short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
         short max_disc = short(ndisp * max_disc_threshold + 0.5);

@@ -109,14 +108,19 @@ namespace
         bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
         if (&dst != &disp)
-            disp.copyTo(dst);
+        {
+            if (stream)
+                stream.enqueueCopy(disp, dst);
+            else
+                disp.copyTo(dst);
+        }
-        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, stream);
+        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
     }

     typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                                 GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, cudaStream_t stream);
+                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);

     const bilateral_filter_operator_t operators[] =
         {bilateral_filter_operator<unsigned char>, 0, 0, bilateral_filter_operator<short>, 0, 0, 0, 0};

@@ -139,18 +143,11 @@ cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radi
     calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
 }

-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, 0);
-}
-
 void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
 {
     CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
     CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, StreamAccessor::getStream(stream));
+    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
 }

 #endif /* !defined (HAVE_CUDA) */
@@ -48,8 +48,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)

-void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&,
-                          GpuMat&) { throw_nogpu(); }
+void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 #else

@@ -57,14 +56,14 @@ namespace cv { namespace gpu
 {
     template <typename T>
     void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStep_<float> weights1, const PtrStep_<float> weights2, PtrStep_<T> result);
+                           const PtrStep_<float> weights1, const PtrStep_<float> weights2, PtrStep_<T> result, cudaStream_t stream);

     void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result);
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream);
 }}

 void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                          GpuMat& result)
+                          GpuMat& result, Stream& stream)
 {
     CV_Assert(img1.size() == img2.size());
     CV_Assert(img1.type() == img2.type());

@@ -83,12 +82,12 @@ void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat&
 {
 case CV_8U:
     if (cn != 4)
-        blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result);
+        blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     else
-        blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result);
+        blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     break;
 case CV_32F:
-    blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result);
+    blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
     break;
 default:
     CV_Error(CV_StsUnsupportedFormat, "bad image depth in linear blending function");
@@ -44,20 +44,11 @@
 #if !defined(HAVE_CUDA)

-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&,
-                              GpuMat&) { throw_nogpu(); }
-
-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&,
-                              GpuMat&, const Stream&) { throw_nogpu(); }
-
-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&,
-                            const Mat&, const Mat&, GpuMat&) { throw_nogpu(); }
-
-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&,
-                            const Mat&, const Mat&, GpuMat&, const Stream&) { throw_nogpu(); }
-
-void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&,
-                             Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }
+void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }

 #else

@@ -66,14 +57,12 @@ using namespace cv::gpu;
 namespace cv { namespace gpu { namespace transform_points
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl,
-              DevMem2D_<float3> dst, cudaStream_t stream);
+    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
 }}}

 namespace
 {
-    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                               GpuMat& dst, cudaStream_t stream)
+    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
         CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);

@@ -88,30 +77,20 @@ namespace
     }
 }

-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              GpuMat& dst)
-{
-    ::transformPointsCaller(src, rvec, tvec, dst, 0);
-}
-
-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              GpuMat& dst, const Stream& stream)
+void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
 {
     ::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
 }

 namespace cv { namespace gpu { namespace project_points
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl,
-              const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
+    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
 }}}

 namespace
 {
-    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                             const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                             cudaStream_t stream)
+    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
         CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);

@@ -124,20 +103,11 @@ namespace
         Rodrigues(rvec, rot);
         dst.create(src.size(), CV_32FC2);
-        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(),
-                             camera_mat.ptr<float>(), dst, stream);
+        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst, stream);
     }
 }

-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                            const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst)
-{
-    ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, 0);
-}
-
-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                            const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                            const Stream& stream)
+void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
 {
     ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
 }
@@ -47,8 +47,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)

-void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int) { throw_nogpu(); }
-void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, const Stream&) { throw_nogpu(); }
+void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

@@ -455,12 +454,7 @@ namespace
     }
 }

-void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn)
-{
-    cvtColor_caller(src, dst, code, dcn, 0);
-}
-
-void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream)
+void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
 {
     cvtColor_caller(src, dst, code, dcn, StreamAccessor::getStream(stream));
 }
@@ -68,19 +68,22 @@ namespace cv { namespace gpu
     template <typename T>
     void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result)
+                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
-        blendLinearKernel<<<grid, threads>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }

     template void blendLinearCaller<uchar>(int, int, int, const PtrStep, const PtrStep,
-                                           const PtrStepf, const PtrStepf, PtrStep);
+                                           const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream);
     template void blendLinearCaller<float>(int, int, int, const PtrStepf, const PtrStepf,
-                                           const PtrStepf, const PtrStepf, PtrStepf);
+                                           const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream);

     __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,

@@ -105,13 +108,16 @@ namespace cv { namespace gpu
     void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result)
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream)
     {
         dim3 threads(16, 16);
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-        blendLinearKernel8UC4<<<grid, threads>>>(rows, cols, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
     }
 }}
\ No newline at end of file
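The kernel-side counterpart, repeated across the .cu hunks below, is the CUDA 4.0 launch idiom: pass the stream as the fourth launch-configuration argument, check the launch itself with cudaGetLastError(), and block with cudaDeviceSynchronize() (the replacement for the deprecated cudaThreadSynchronize()) only when running on the default stream. A self-contained sketch of the idiom, assuming the module's cudaSafeCall macro; addOneKernel and addOneCaller are hypothetical names, not part of this commit:

    __global__ void addOneKernel(float* data, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] += 1.0f;
    }

    void addOneCaller(float* d_data, int n, cudaStream_t stream)
    {
        dim3 threads(256);
        dim3 grid((n + threads.x - 1) / threads.x);

        // asynchronous launch on the caller's stream
        addOneKernel<<<grid, threads, 0, stream>>>(d_data, n);
        cudaSafeCall( cudaGetLastError() );          // catches launch-configuration errors

        if (stream == 0)                             // default stream keeps blocking semantics
            cudaSafeCall( cudaDeviceSynchronize() );
    }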
@@ -184,7 +184,9 @@ namespace cv { namespace gpu
         computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
                 num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 } // namespace solvepnp_ransac
@@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc
     };

     template <typename T1, typename T2>
-    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
         NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
+        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
     }

-    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<uint, uint>(src1, src2, dst);
+        compare_ne<uint, uint>(src1, src2, dst, stream);
     }
-    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<float, float>(src1, src2, dst);
+        compare_ne<float, float>(src1, src2, dst, stream);
     }

@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                        img_block_width, grad, qangle, scale, block_hists);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -324,7 +324,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
                        block_hists, coefs, free_coef, threshold, labels);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 //----------------------------------------------------------------------------

@@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 //----------------------------------------------------------------------------

@@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
         compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

 template <int nthreads, int correct_gamma>

@@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
         compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

@@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
     resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall( cudaThreadSynchronize() );
+
+    cudaSafeCall( cudaDeviceSynchronize() );
+
     cudaSafeCall( cudaUnbindTexture(tex) );
 }
@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture(tex_remap) );
     }

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////

@@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }
     extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)

@@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }

@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)

@@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////

@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)

@@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc
         extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////

@@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall(cudaUnbindTexture(harrisDxTex));
         cudaSafeCall(cudaUnbindTexture(harrisDyTex));
     }

@@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+
+        cudaSafeCall(cudaDeviceSynchronize());
         cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
         cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
     }

@@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc
         column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 //////////////////////////////////////////////////////////////////////////

@@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

 /////////////////////////////////////////////////////////////////////////

@@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
         downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);
@@ -46,6 +46,8 @@
 #include "opencv2/gpu/devmem2d.hpp"
 #include "safe_call.hpp"
 #include "cuda_runtime.h"
+#include "npp.h"
+#include "NPP_staging.hpp"

 namespace cv
 {

@@ -106,6 +108,41 @@ namespace cv
             cudaSafeCall( cudaGetTextureReference(&tex, name) );
             cudaSafeCall( cudaUnbindTexture(tex) );
         }
+
+        class NppStreamHandler
+        {
+        public:
+            inline explicit NppStreamHandler(cudaStream_t newStream = 0)
+            {
+                oldStream = nppGetStream();
+                nppSetStream(newStream);
+            }
+
+            inline ~NppStreamHandler()
+            {
+                nppSetStream(oldStream);
+            }
+
+        private:
+            cudaStream_t oldStream;
+        };
+
+        class NppStStreamHandler
+        {
+        public:
+            inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
+            {
+                oldStream = nppStSetActiveCUDAstream(newStream);
+            }
+
+            inline ~NppStStreamHandler()
+            {
+                nppStSetActiveCUDAstream(oldStream);
+            }
+
+        private:
+            cudaStream_t oldStream;
+        };
     }
 }
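The two classes added above are RAII guards: the constructor points NPP (or NPP_staging) at the caller's stream and the destructor restores the previous one, which lets NPP-backed functions follow the same Stream& convention as the hand-written kernels. A hypothetical caller, not part of this commit, sketching the intended use:

    void nppBackedOperation(cudaStream_t stream)
    {
        NppStreamHandler h(stream);   // nppSetStream(stream); previous stream saved

        // ... any NPP call made here now executes on `stream` ...

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );  // keep default-stream blocking semantics
    }   // ~NppStreamHandler restores the previous NPP stream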
@@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U(
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U(
w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
}
...
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)
...
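Note: the "if (stream == 0)" guard that starts appearing here is the commit's convention for stream-aware callers: the kernel is always launched asynchronously on the supplied stream, and the function only blocks when it was invoked on the default stream. A condensed sketch of that convention (kernel and names are placeholders, and the module's cudaSafeCall wrapper is replaced by bare runtime calls):

#include <cuda_runtime.h>

__global__ void workKernel(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] += 1.0f;
}

void callerPattern(float* d_data, int n, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    workKernel<<<grid, threads, 0, stream>>>(d_data, n);  // always an async launch
    cudaGetLastError();                                   // check the launch itself

    if (stream == 0)                 // default stream: keep the old blocking behavior
        cudaDeviceSynchronize();     // otherwise the caller synchronizes later
}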
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall ( cudaThreadSynchronize() );
+cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);
...
@@ -275,11 +275,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -306,11 +306,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -363,11 +363,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -395,11 +395,11 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
@@ -609,17 +609,17 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
-cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
-cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
-cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
+cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
@@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc
countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc
break;
}
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
...
@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
@@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge {
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall(cudaThreadSynchronize());
+cudaSafeCall(cudaDeviceSynchronize());
}
...
@@ -102,19 +102,19 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
@@ -327,8 +327,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
@@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
@@ -531,10 +531,10 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
}}}
@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
{
@@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
@@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
@@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream);
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
}
@@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void output_gpu<short>(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_<short>& disp, cudaStream_t stream);
...
@@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
@@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
@@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
...
@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
@@ -64,6 +64,8 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { thro
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
+Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
+cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#else /* !defined (HAVE_CUDA) */
@@ -117,7 +119,7 @@ namespace
}
}
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
+CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
void cv::gpu::Stream::create()
{
@@ -188,18 +190,35 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+    {
+        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, impl->stream) );
+        return;
+    }
+    if (src.depth() == CV_8U)
+    {
+        int cn = src.channels();
+        if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+        {
+            int val = saturate_cast<uchar>(s[0]);
+            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, impl->stream) );
+            return;
+        }
+    }
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
kernelSet<int>, kernelSet<float>, kernelSet<double>
};
-set_callers[src.depth()](src, val, impl->stream);
+set_callers[src.depth()](src, s, impl->stream);
}
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -246,5 +265,17 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
}
+cv::gpu::Stream::operator bool() const
+{
+    return impl && impl->stream;
+}
+cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {}
+cv::gpu::Stream& cv::gpu::Stream::Null()
+{
+    static Stream s((Impl*)0);
+    return s;
+}
#endif /* !defined (HAVE_CUDA) */
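Note: two additions land here. Stream::Null() exposes a shared default-stream object whose impl is null, which the patched getStream() maps to stream 0, and operator bool() lets callers test for a real stream. Separately, enqueueMemSet now takes a fast path through cudaMemset2DAsync whenever the scalar is all zeros, or the matrix is 8-bit with every channel set to the same value; only the remaining cases fall through to the kernelSet dispatch table. A standalone sketch of just that fast-path test (simplified types, names illustrative, not the module's code):

// Returns true when a fill can be expressed as a plain byte memset: either the
// value is zero in all four channels, or the matrix is 8-bit (depth == 0,
// i.e. CV_8U) and every used channel carries the same value.
bool canUseMemset2D(int depth, int channels, const double s[4], unsigned char& byteVal)
{
    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
    {
        byteVal = 0;
        return true;
    }
    if (depth == 0)  // CV_8U
    {
        bool uniform = true;
        for (int c = 1; c < channels; ++c)
            uniform = uniform && (s[c] == s[0]);
        if (uniform)
        {
            byteVal = (unsigned char)s[0];
            return true;
        }
    }
    return false;
}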
@@ -44,11 +44,11 @@
#if !defined (HAVE_CUDA)
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf)
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
Size src_size = terminals.size();
CV_Assert(terminals.type() == CV_32S);
@@ -73,17 +73,17 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
buf.create(1, bufsz, CV_8U);
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
terminals.step, leftTransp.step, sznpp, labels.ptr<Npp8u>(), labels.step, buf.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+    cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */
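Note: with the Stream& parameter, callers can queue the graph cut on their own stream and defer synchronization. A hypothetical call site under the 2.x gpu API (all variable names illustrative; the GpuMats are assumed to be pre-filled CV_32S inputs sized per the nppiGraphcut requirements):

#include "opencv2/gpu/gpu.hpp"

void runGraphcutAsync(cv::gpu::GpuMat& terminals, cv::gpu::GpuMat& leftTransp,
                      cv::gpu::GpuMat& rightTransp, cv::gpu::GpuMat& top,
                      cv::gpu::GpuMat& bottom, cv::gpu::GpuMat& labels,
                      cv::gpu::GpuMat& buf)
{
    cv::gpu::Stream stream;  // user-created, non-default stream

    // Queued on `stream`; no implicit cudaDeviceSynchronize() happens inside.
    cv::gpu::graphcut(terminals, leftTransp, rightTransp, top, bottom, labels, buf, stream);

    // ... queue further work on the same stream ...

    stream.waitForCompletion();  // explicit sync replaces the implicit one
}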
@@ -128,7 +128,7 @@ void cv::gpu::GpuMat::copyTo( GpuMat& m ) const
CV_DbgAssert(!this->empty());
m.create(size(), type());
cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::GpuMat::copyTo( GpuMat& mat, const GpuMat& mask ) const
@@ -179,7 +179,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), src.step, dst.ptr<dst_t>(), dst.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
@@ -193,7 +193,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), src.step, dst.ptr<dst_t>(), dst.step, sz, NPP_RND_NEAR) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -349,7 +349,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
@@ -364,7 +364,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -400,7 +400,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
@@ -415,7 +415,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -463,8 +463,8 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
-{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
-{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
...
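Note: the setTo dispatch above is a depth-by-channels table of function pointers: an NPP primitive wherever a matching nppiSet_* variant exists (the two-channel 16u/16s entries are newly wired in), and the templated kernelSet fallback elsewhere. The table idea, stripped to its essentials (a standalone sketch with simplified types and made-up names):

#include <cstdio>

typedef void (*set_func_t)(double value);

void nppLikeSet(double)    { std::printf("NPP fast path\n"); }
void kernelLikeSet(double) { std::printf("custom-kernel fallback\n"); }

// Rows indexed by depth, columns by channel count - 1.
static const set_func_t setCallers[2][4] =
{
    { nppLikeSet,    kernelLikeSet, kernelLikeSet, nppLikeSet    },  // depth 0
    { kernelLikeSet, kernelLikeSet, kernelLikeSet, kernelLikeSet },  // depth 1
};

void setTo(int depth, int channels, double value)
{
    setCallers[depth][channels - 1](value);  // O(1) dispatch, no switch/if chains
}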
@@ -114,24 +114,14 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
sz.width = src.cols;
sz.height = src.rows;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf(2);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, dbuf, (double*)dbuf + 1) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
-#else
-nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
}
@@ -184,25 +174,15 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
int funcIdx = normType >> 1;
double retVal;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, dbuf) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
dbuf.download(&retVal);
-#else
-nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, &retVal) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
return retVal;
}
...
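Note: from NPP 4.0 these reduction primitives write their results to device memory, so the old branch that passed host pointers directly is deleted: every call now goes through a small device-side scratch buffer that is read back after synchronization. A simplified stand-in for that helper (a sketch, not the module's actual DeviceBuffer, and it omits error checking):

#include <cuda_runtime.h>

// Owns `count` doubles in device memory for NPP reduction outputs.
struct DeviceBufferSketch
{
    double* ptr;
    int count;

    explicit DeviceBufferSketch(int count_ = 1) : ptr(0), count(count_)
    {
        cudaMalloc(&ptr, count * sizeof(double));
    }
    ~DeviceBufferSketch() { cudaFree(ptr); }

    operator double*() { return ptr; }  // pass straight into the NPP call

    void download(double* dst)          // copy the results back to the host
    {
        cudaMemcpy(dst, ptr, count * sizeof(double), cudaMemcpyDeviceToHost);
    }
};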
@@ -332,7 +332,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -349,7 +349,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
@@ -370,7 +370,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -389,7 +389,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
...
@@ -77,8 +77,8 @@
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
-#define CUDART_MINIMUM_REQUIRED_VERSION 3020
-#define NPP_MINIMUM_REQUIRED_VERSION 3216
+#define CUDART_MINIMUM_REQUIRED_VERSION 4000
+#define NPP_MINIMUM_REQUIRED_VERSION 4000
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
...
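Note: raising both minimums to 4000 turns the CUDA 4.0 requirement into a hard compile-time failure instead of a runtime surprise. Spelled out, the guard pattern is simply (a sketch; CUDART_VERSION is defined by the CUDA runtime headers, e.g. 4000 for CUDA 4.0):

#include <cuda_runtime_api.h>  // defines CUDART_VERSION

#define CUDART_MINIMUM_REQUIRED_VERSION 4000

#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
    #error "Insufficient Cuda Runtime library version, please update it."
#endif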
@@ -46,14 +46,10 @@ using namespace std;
#if !defined (HAVE_CUDA)
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -148,51 +144,25 @@ namespace cv { namespace gpu { namespace split_merge
}}}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst)
-{
-    split_merge::merge(src, n, dst, 0);
-}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst)
-{
-    split_merge::merge(&src[0], src.size(), dst, 0);
-}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst)
-{
-    split_merge::split(src, dst, 0);
-}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst)
-{
-    dst.resize(src.channels());
-    if(src.channels() > 0)
-        split_merge::split(src, &dst[0], 0);
-}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
split_merge::split(src, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
...
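Note: the stream-less merge/split overloads can be dropped because a Stream& parameter defaulting to Stream::Null() serves both synchronous and asynchronous callers, halving the entry points. The idiom in isolation (generic names, not the gpu module's actual declarations):

#include <vector>

struct Stream
{
    static Stream& Null()  // shared "default stream" singleton
    {
        static Stream s;
        return s;
    }
};

// One declaration covers both styles of call:
void merge(const std::vector<int>& src, int& dst, Stream& stream = Stream::Null());

void merge(const std::vector<int>& src, int& dst, Stream& stream)
{
    bool async = (&stream != &Stream::Null());  // dispatch on the null stream
    (void)src; (void)dst; (void)async;
}

// merge(v, d);           -> blocking, default stream
// merge(v, d, myStream); -> queued on myStream, caller synchronizes later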