Commit 11dfceb2 authored by cuda-geek's avatar cuda-geek Committed by OpenCV Buildbot

Merge pull request #328 from jet47:new-gpu-fixes

parents 2b4ffd11 395f0201
......@@ -110,14 +110,15 @@ endif()
# Optional 3rd party components
# ===================================================
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS)
OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON)
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS))
OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_IMAGEIO "ImageIO support for OS X" OFF IF APPLE)
......@@ -140,9 +141,9 @@ OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON IF WIN32 )
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
# OpenCV build components
......@@ -161,12 +162,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )
# 3rd party libs
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE )
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA )
# OpenCV installation options
......@@ -778,6 +779,7 @@ if(HAVE_CUDA)
status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO)
status(" NVIDIA GPU arch:" ${OPENCV_CUDA_ARCH_BIN})
status(" NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_PTX})
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
......
......@@ -3,17 +3,17 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
return()
endif()
if (WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler suppoted on your platform).")
if(WIN32 AND NOT MSVC)
message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler supported on your platform).")
return()
endif()
if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsuppoted on your platform).")
if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
return()
endif()
find_package(CUDA 4.1)
find_package(CUDA 4.2 QUIET)
if(CUDA_FOUND)
set(HAVE_CUDA 1)
......@@ -26,16 +26,21 @@ if(CUDA_FOUND)
set(HAVE_CUBLAS 1)
endif()
if(WITH_NVCUVID)
find_cuda_helper_libs(nvcuvid)
set(HAVE_NVCUVID 1)
endif()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1")
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
if (CARMA)
set(CUDA_ARCH_BIN "2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
endif()
set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
......@@ -78,6 +83,15 @@ if(CUDA_FOUND)
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach()
if(CARMA)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" )
if (CMAKE_VERSION VERSION_LESS 2.8.10)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" )
endif()
endif()
# These vars will be processed in other scripts
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
......@@ -92,7 +106,6 @@ if(CUDA_FOUND)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
macro(ocv_cuda_compile VAR)
......@@ -106,15 +119,15 @@ if(CUDA_FOUND)
string(REPLACE "-ggdb3" "" ${var} "${${var}}")
endforeach()
if (BUILD_SHARED_LIBS)
if(BUILD_SHARED_LIBS)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS)
endif()
if(UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
endif()
if(APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()
# disabled because of multiple warnings during building nvcc auto generated files
......
......@@ -44,6 +44,7 @@ set(OpenCV_COMPUTE_CAPABILITIES @OpenCV_CUDA_CC_CONFIGCMAKE@)
set(OpenCV_CUDA_VERSION @OpenCV_CUDA_VERSION@)
set(OpenCV_USE_CUBLAS @HAVE_CUBLAS@)
set(OpenCV_USE_CUFFT @HAVE_CUFFT@)
set(OpenCV_USE_NVCUVID @HAVE_NVCUVID@)
# Android API level from which OpenCV has been compiled is remembered
set(OpenCV_ANDROID_NATIVE_API_LEVEL @OpenCV_ANDROID_NATIVE_API_LEVEL_CONFIGCMAKE@)
......@@ -218,17 +219,22 @@ foreach(__opttype OPT DBG)
else()
#TODO: duplicates are annoying but they should not be the problem
endif()
# fix hard coded paths for CUDA libraries under Windows
if(WIN32 AND OpenCV_CUDA_VERSION AND NOT OpenCV_SHARED)
# CUDA
if(OpenCV_CUDA_VERSION AND (CARMA OR (WIN32 AND NOT OpenCV_SHARED)))
if(NOT CUDA_FOUND)
find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
else()
if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION)
message(FATAL_ERROR "OpenCV static library compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
if(WIN32)
message(FATAL_ERROR "OpenCV static library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
else()
message(FATAL_ERROR "OpenCV library for CARMA was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
endif()
endif()
endif()
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_nvcuvid_LIBRARY} ${CUDA_nvcuvenc_LIBRARY})
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
if(OpenCV_USE_CUBLAS)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})
......@@ -238,6 +244,13 @@ foreach(__opttype OPT DBG)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
endif()
if(OpenCV_USE_NVCUVID)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES})
endif()
if(WIN32)
list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES})
endif()
endif()
endforeach()
......
......@@ -175,21 +175,15 @@
/* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA
/* OpenCL Support */
#cmakedefine HAVE_OPENCL
/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT
/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS
/* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT
/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS
/* NVidia Video Decoding API*/
#cmakedefine HAVE_NVCUVID
/* Compile for 'real' NVIDIA GPU architectures */
#define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
......@@ -202,6 +196,15 @@
/* Create PTX or BIN for 1.0 compute capability */
#cmakedefine CUDA_ARCH_BIN_OR_PTX_10
/* OpenCL Support */
#cmakedefine HAVE_OPENCL
/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT
/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS
/* VideoInput library */
#cmakedefine HAVE_VIDEOINPUT
......
......@@ -10,7 +10,6 @@ if(HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu")
ocv_cuda_compile(cuda_objs ${lib_cuda})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
else()
set(lib_cuda "")
......
......@@ -177,6 +177,20 @@ namespace cv
//#undef __CV_GPU_DEPR_BEFORE__
//#undef __CV_GPU_DEPR_AFTER__
namespace device
{
using cv::gpu::PtrSz;
using cv::gpu::PtrStep;
using cv::gpu::PtrStepSz;
using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
using cv::gpu::PtrStepSzi;
using cv::gpu::PtrStepb;
using cv::gpu::PtrStepf;
using cv::gpu::PtrStepi;
}
}
}
......
......@@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
// Gives information about what GPU archs this OpenCV GPU module was
// compiled for
class CV_EXPORTS TargetArchs
......
......@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*);
void writeScalar(const float*);
void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}
......@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D>
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const
__device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{
return saturate_cast<D>(alpha * src + beta);
}
double alpha, beta;
S alpha, beta;
};
namespace detail
......@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
};
}
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{
};
template<typename T, typename D>
template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
......@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
{0,0,0,0,0,0,0,0}
static const caller_t tab[7][7] =
{
{
cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
cvt_<uchar, ushort, float>,
cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
cvt_<uchar, float, float>,
cvt_<uchar, double, double>
},
{
cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
cvt_<schar, ushort, float>,
cvt_<schar, short, float>,
cvt_<schar, int, float>,
cvt_<schar, float, float>,
cvt_<schar, double, double>
},
{
cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
};
caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
func(src, dst, alpha, beta, stream);
}
......
This diff is collapsed.
......@@ -5,7 +5,7 @@ endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda" "${CMAKE_CURRENT_SOURCE_DIR}/../highgui/src")
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.hpp" "include/opencv2/${name}/device/*.h")
......@@ -22,17 +22,14 @@ source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
set(ncv_files ${ncv_srcs} ${ncv_cuda})
source_group("Src\\NVidia" FILES ${ncv_files})
ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
if(MSVC)
......@@ -47,23 +44,18 @@ if (HAVE_CUDA)
ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
if(NOT APPLE)
unset(CUDA_nvcuvid_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvid)
if(WITH_NVCUVID)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
endif()
if(WIN32)
unset(CUDA_nvcuvenc_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvenc)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
endif()
if(NOT APPLE AND WITH_FFMPEG)
if(WITH_FFMPEG)
set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
endif()
else()
......
......@@ -216,6 +216,86 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2Lab_TRAITS
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_GPU_IMPLEMENT_Lab2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2Luv_TRAITS
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_GPU_IMPLEMENT_Luv2RGB_TRAITS
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
......@@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#ifdef __CUDACC__
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
......@@ -96,19 +94,25 @@ namespace cv { namespace gpu
namespace device
{
using cv::gpu::divUp;
#ifdef __CUDACC__
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
#ifdef _WIN32
typedef unsigned int uint;
#endif
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
#endif // __CUDACC__
}
}}
#endif // __CUDACC__
#endif // __OPENCV_GPU_COMMON_HPP__
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
......@@ -44,7 +44,6 @@
#define OPENCV_GPU_EMULATION_HPP_
#include "warp_reduce.hpp"
#include <stdio.h>
namespace cv { namespace gpu { namespace device
{
......
......@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
template <> struct name<type> : binary_function<type, type, type> \
{ \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
__device__ __forceinline__ name():binary_function<type, type, type>(){}\
__device__ __forceinline__ name() {}\
__device__ __forceinline__ name(const name&) {}\
};
template <typename T> struct maximum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? rhs : lhs;
return max(lhs, rhs);
}
__device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
__device__ __forceinline__ maximum():binary_function<T, T, T>(){}
__device__ __forceinline__ maximum() {}
__device__ __forceinline__ maximum(const maximum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
......@@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? lhs : rhs;
return min(lhs, rhs);
}
__device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
__device__ __forceinline__ minimum():binary_function<T, T, T>(){}
__device__ __forceinline__ minimum() {}
__device__ __forceinline__ minimum(const minimum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
......@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
// Math functions
///bound=========================================
template <typename T> struct abs_func : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
{
return abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
__device__ __forceinline__ unsigned char operator ()(unsigned char x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
__device__ __forceinline__ signed char operator ()(signed char x) const
{
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
__device__ __forceinline__ char operator ()(char x) const
{
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
__device__ __forceinline__ unsigned short operator ()(unsigned short x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
__device__ __forceinline__ short operator ()(short x) const
{
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
__device__ __forceinline__ unsigned int operator ()(unsigned int x) const
{
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
__device__ __forceinline__ int operator ()(int x) const
{
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
__device__ __forceinline__ float operator ()(float x) const
{
return ::fabsf(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
__device__ __forceinline__ double operator ()(double x) const
{
return ::fabs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
};
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
template <typename T> struct name ## _func : unary_function<T, float> \
{ \
......@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func ## f(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : unary_function<double, double> \
{ \
......@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
......@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
} \
};
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
......
This diff is collapsed.
......@@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
return (uchar) ::max((int)v, 0);
uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
uint res = 0;
asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<uchar>(iv);
#if __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<uchar>((float)v);
#endif
......@@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
return (schar) ::min((int)v, SCHAR_MAX);
uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
return saturate_cast<schar>((int)v);
uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
return (schar) ::min(v, (uint)SCHAR_MAX);
uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
uint res = 0;
asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<schar>(iv);
#if __CUDA_ARCH__ >= 130
uint res = 0;
asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else
return saturate_cast<schar>((float)v);
#endif
......@@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
return (ushort) ::max((int)v, 0);
ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
return (ushort) ::max((int)v, 0);
ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
return (ushort) ::min(v, (uint)USHRT_MAX);
ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
ushort res = 0;
asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<ushort>(iv);
#if __CUDA_ARCH__ >= 130
ushort res = 0;
asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<ushort>((float)v);
#endif
......@@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
return (short) ::min((int)v, SHRT_MAX);
short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
return (short) ::min(v, (uint)SHRT_MAX);
short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
short res = 0;
asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<short>(iv);
#if __CUDA_ARCH__ >= 130
short res = 0;
asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
int res = 0;
asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
......@@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
#endif
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
uint res = 0;
int vi = v;
asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
uint res = 0;
asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
uint res = 0;
asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
......
......@@ -45,7 +45,6 @@
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
}
};
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
......
......@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp"
#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
......@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
......@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
......@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
......@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
......
......@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl(val, srcLane, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl((int) val, srcLane, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo, srcLane, width);
hi = __shfl(hi, srcLane, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_down(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_down((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo, delta, width);
hi = __shfl_down(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
template <typename T>
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return __shfl_up(val, delta, width);
#else
return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
return (unsigned int) __shfl_up((int) val, delta, width);
#else
return 0;
#endif
}
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo, delta, width);
hi = __shfl_up(hi, delta, width);
return __hiloint2double(hi, lo);
#else
return 0.0;
#endif
}
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5)
set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5)
#suppress compiller varning
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" )
# can be any other plases
set(__arm_linux_eabi_root /usr/arm-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root})
if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
endif()
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CARMA 1)
add_definitions(-DCARMA)
This diff is collapsed.
This diff is collapsed.
......@@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
SANITY_CHECK(found_locations);
}
///////////////////////////////////////////////////////////////
// HaarClassifier
......
This diff is collapsed.
This diff is collapsed.
......@@ -622,7 +622,7 @@ private:
}
// copy data structures on gpu
stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment