Merge branch 'master' of code.opencv.org:opencv

4c15465f · Alexander Mordvintesv · a4bffd96 · bf4c1df0 · 4c15465f · 4c15465f
Commit 4c15465f authored Aug 18, 2012 by Alexander Mordvintesv
30 changed files
--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
 # ------------------------------------------------------------------------------
 #  Android CMake toolchain file, for use with the Android NDK r5-r8
-#  Requires cmake 2.6.3 or newer (2.8.3 or newer is recommended).
+#  Requires cmake 2.6.3 or newer (2.8.5 or newer is recommended).
 #  See home page: http://code.google.com/p/android-cmake/
 #
 #  The file is mantained by the OpenCV project. And also can be found at
@@ -44,7 +44,8 @@
 #    ANDROID_ABI=armeabi-v7a -  specifies the target Application Binary
 #      Interface (ABI). This option nearly matches to the APP_ABI variable
 #      used by ndk-build tool from Android NDK.
-#      Possible values are:
+#
+#      Possible targets are:
 #        "armeabi" - matches to the NDK ABI with the same name.
 #           See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
 #        "armeabi-v7a" - matches to the NDK ABI with the same name.
@@ -56,6 +57,8 @@
 #        "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP.
 #        "x86" - matches to the NDK ABI with the same name.
 #            See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
+#        "mips" - matches to the NDK ABI with the same name
+#            (not tested on real devices)
 #
 #    ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
 #      Option is read-only when standalone toolchain used.
@@ -183,12 +186,13 @@
 #   - modified August 2012
 #     [+] updated for NDK r8b
 #     [~] all intermediate files generated by toolchain are moved into CMakeFiles
+#     [~] libstdc++ and libsupc++ are removed from explicit link libraries
 # ------------------------------------------------------------------------------

 cmake_minimum_required( VERSION 2.6.3 )

 if( DEFINED CMAKE_CROSSCOMPILING )
- #subsequent toolchain loading is not really needed
+ # subsequent toolchain loading is not really needed
 return()
 endif()

@@ -199,7 +203,7 @@ endif()

 # this one is important
 set( CMAKE_SYSTEM_NAME Linux )
-#this one not so much
+# this one not so much
 set( CMAKE_SYSTEM_VERSION 1 )

 set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
@@ -331,11 +335,11 @@ macro( __COPY_IF_DIFFERENT _source _destination )
 endmacro()


-#stl version: by default gnustl_static will be used
+# stl version: by default gnustl_static will be used
 set( ANDROID_USE_STLPORT FALSE CACHE BOOL "Experimental: use stlport_static instead of gnustl_static")
 mark_as_advanced( ANDROID_USE_STLPORT )

-#fight against cygwin
+# fight against cygwin
 set( ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
 mark_as_advanced( ANDROID_FORBID_SYGWIN )
 if( ANDROID_FORBID_SYGWIN )
@@ -344,7 +348,7 @@ if( ANDROID_FORBID_SYGWIN )
 endif()

 if( CMAKE_HOST_WIN32 )
-  #remove cygwin from PATH
+  # remove cygwin from PATH
  set( __new_path "$ENV{PATH}")
  __LIST_FILTER( __new_path "cygwin" )
  set(ENV{PATH} "${__new_path}")
@@ -352,7 +356,7 @@ if( ANDROID_FORBID_SYGWIN )
 endif()
 endif()

-#detect current host platform
+# detect current host platform
 set( TOOL_OS_SUFFIX "" )
 if( CMAKE_HOST_APPLE )
 set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86" )
@@ -365,10 +369,10 @@ else()
 message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
 endif()

-#see if we have path to Android NDK
+# see if we have path to Android NDK
 __INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
 if( NOT ANDROID_NDK )
- #see if we have path to Android standalone toolchain
+ # see if we have path to Android standalone toolchain
 __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN OBSOLETE_ANDROID_NDK_TOOLCHAIN_ROOT OBSOLETE_ENV_ANDROID_NDK_TOOLCHAIN_ROOT )

 if( NOT ANDROID_STANDALONE_TOOLCHAIN )
@@ -397,10 +401,10 @@ if( NOT ANDROID_NDK )
 endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )

-#remember found paths
+# remember found paths
 if( ANDROID_NDK )
 get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- #try to detect change
+ # try to detect change
 if( CMAKE_AR )
  string( LENGTH "${ANDROID_NDK}" __length )
  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
@@ -414,7 +418,7 @@ if( ANDROID_NDK )
 set( BUILD_WITH_ANDROID_NDK True )
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
 get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
- #try to detect change
+ # try to detect change
 if( CMAKE_AR )
  string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
@@ -438,7 +442,7 @@ else()
      sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()

-#get all the details about standalone toolchain
+# get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
 set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
@@ -455,7 +459,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  set( __availableToolchainArchs "mipsel" )
 endif()
 if( ANDROID_COMPILER_VERSION )
-  #do not run gcc every time because it is relatevely expencive
+  # do not run gcc every time because it is relatively expensive
  set( __availableToolchainCompilerVersions "${ANDROID_COMPILER_VERSION}" )
 else()
  execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" --version
@@ -464,7 +468,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 endif()
 endif()

-#get all the details about NDK
+# get all the details about NDK
 if( BUILD_WITH_ANDROID_NDK )
 file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
 string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" )
@@ -490,7 +494,7 @@ if( BUILD_WITH_ANDROID_NDK )
 endif()
 endif()

-#build list of available ABIs
+# build list of available ABIs
 if( NOT ANDROID_SUPPORTED_ABIS )
 set( ANDROID_SUPPORTED_ABIS "" )
 set( __uniqToolchainArchNames ${__availableToolchainArchs} )
@@ -505,9 +509,9 @@ if( NOT ANDROID_SUPPORTED_ABIS )
 endif()
 endif()

-#choose target ABI
+# choose target ABI
 __INIT_VARIABLE( ANDROID_ABI OBSOLETE_ARM_TARGET OBSOLETE_ARM_TARGETS VALUES ${ANDROID_SUPPORTED_ABIS} )
-#verify that target ABI is supported
+# verify that target ABI is supported
 list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
 if( __androidAbiIdx EQUAL -1 )
 string( REPLACE ";" "\", \"", PRINTABLE_ANDROID_SUPPORTED_ABIS  "${ANDROID_SUPPORTED_ABIS}" )
@@ -517,10 +521,10 @@ if( __androidAbiIdx EQUAL -1 )
 endif()
 unset( __androidAbiIdx )

-#remember target ABI
+# remember target ABI
 set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )

-#set target ABI options
+# set target ABI options
 if( ANDROID_ABI STREQUAL "x86" )
 set( X86 true )
 set( ANDROID_NDK_ABI_NAME "x86" )
@@ -545,7 +549,7 @@ elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
 set( ANDROID_ARCH_NAME "arm" )
 set( ANDROID_ARCH_FULLNAME "arm" )
 set( CMAKE_SYSTEM_PROCESSOR "armv6" )
- #need always fallback to older platform
+ # need always fallback to older platform
 set( ARMEABI true )
 elseif( ANDROID_ABI STREQUAL "armeabi-v7a")
 set( ARMEABI_V7A true )
@@ -573,8 +577,8 @@ else()
 endif()

 if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" )
- #really dirty hack
- #it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
+ # really dirty hack
+ # it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
 file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" )
 endif()

@@ -592,7 +596,7 @@ else()
 unset( ANDROID_FORCE_ARM_BUILD CACHE )
 endif()

-#choose toolchain
+# choose toolchain
 if( ANDROID_TOOLCHAIN_NAME )
 list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
 if( __toolchainIdx EQUAL -1 )
@@ -637,10 +641,10 @@ unset( __availableToolchainMachines )
 unset( __availableToolchainArchs )
 unset( __availableToolchainCompilerVersions )

-#choose native API level
+# choose native API level
 __INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
 string( REGEX MATCH "[0-9]+" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" )
-#validate
+# validate
 list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
 if( __levelIdx EQUAL -1 )
 message( SEND_ERROR "Specified Android native API level (${ANDROID_NATIVE_API_LEVEL}) is not supported by your NDK/toolchain." )
@@ -659,7 +663,7 @@ if( CMAKE_VERSION VERSION_GREATER "2.8" )
 set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
 endif()

-#setup paths
+# setup paths
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
 set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
@@ -689,7 +693,7 @@ set( CMAKE_ASM_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHI
 if( CMAKE_VERSION VERSION_LESS 2.8.5 )
 set( CMAKE_ASM_COMPILER_ARG1 "-c" )
 endif()
-#there may be a way to make cmake deduce these TODO deduce the rest of the tools
+# there may be a way to make cmake deduce these; TODO: deduce the rest of the tools
 set( CMAKE_STRIP        "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-strip${TOOL_OS_SUFFIX}"   CACHE PATH "strip" )
 set( CMAKE_AR           "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ar${TOOL_OS_SUFFIX}"      CACHE PATH "archive" )
 set( CMAKE_LINKER       "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ld${TOOL_OS_SUFFIX}"      CACHE PATH "linker" )
@@ -705,11 +709,12 @@ if( APPLE )
 endif()
 mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
 endif()
-#export directories
+
+# export directories
 set( ANDROID_SYSTEM_INCLUDE_DIRS "" )
 set( ANDROID_SYSTEM_LIB_DIRS "" )

-#setup output directories
+# setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )

@@ -722,13 +727,13 @@ if(NOT _CMAKE_IN_TRY_COMPILE)
 set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "path for android libs" )
 endif()

-#includes
+# includes
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_SYSROOT}/usr/include" )
 if( __stlIncludePath AND EXISTS "${__stlIncludePath}" )
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlIncludePath}" )
 endif()

-#STL bits includes
+# c++ bits includes
 if( __stlLibPath AND EXISTS "${__stlLibPath}/include" )
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlLibPath}/include" )
 endif()
@@ -742,7 +747,7 @@ elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/incl
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
 endif()

-#flags and definitions
+# flags and definitions
 if(ANDROID_SYSROOT MATCHES "[ ;\"]")
 set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
 # quotes will break try_compile and compiler identification
@@ -766,7 +771,7 @@ set( CMAKE_CXX_PLATFORM_ID Linux )
 set( CMAKE_CXX_SIZEOF_DATA_PTR 4 )
 set( CMAKE_CXX_HAS_ISYSROOT 1 )
 set( CMAKE_CXX_COMPILER_ABI ELF )
-#force ASM compiler (required for CMake < 2.8.5)
+# force ASM compiler (required for CMake < 2.8.5)
 set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
 set( CMAKE_ASM_COMPILER_ID GNU )
 set( CMAKE_ASM_COMPILER_WORKS TRUE )
@@ -796,17 +801,17 @@ endif()

 if( ANDROID_USE_STLPORT )
 set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions" )
- set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-rtti -fno-exceptions" )
+ set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-exceptions" )
 else()
 set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -frtti -fexceptions" )
 set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fexceptions" )
 endif()

-#release and debug flags
+# release and debug flags
 if( ARMEABI OR ARMEABI_V7A )
 if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
-  #It is recommended to use the -mthumb compiler flag to force the generation
-  #of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
+  # It is recommended to use the -mthumb compiler flag to force the generation
+  # of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
  # O3 instead of O2/Os in release mode - like cmake sets for desktop gcc
  set( _CMAKE_CXX_FLAGS_RELEASE "-mthumb -O3" )
  set( _CMAKE_C_FLAGS_RELEASE   "-mthumb -O3" )
@@ -836,7 +841,7 @@ set( _CMAKE_C_FLAGS_RELEASE   "${_CMAKE_C_FLAGS_RELEASE}   -fomit-frame-pointer
 set( _CMAKE_CXX_FLAGS_DEBUG "${_CMAKE_CXX_FLAGS_DEBUG} -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )
 set( _CMAKE_C_FLAGS_DEBUG   "${_CMAKE_C_FLAGS_DEBUG}   -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )

-#ABI-specific flags
+# ABI-specific flags
 if( ARMEABI_V7A )
 set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp" )
 if( NEON )
@@ -854,19 +859,18 @@ elseif( X86 )
 set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS}" )#sse?
 endif()

-#linker flags
+# linker flags
 if( NOT DEFINED __ndklibspath )
 set( __ndklibspath "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/ndklibs/${ANDROID_NDK_ABI_NAME}" )
 endif()
-list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
+list( APPEND ANDROID_SYSTEM_LIB_DIRS "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 set( ANDROID_LINKER_FLAGS "" )
-#STL
+
+# STL
 if( ANDROID_USE_STLPORT )
 if( EXISTS "${__stlLibPath}/libstlport_static.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${__ndklibspath}/libstlport_static.a" )
- endif()
- if( EXISTS "${__ndklibspath}/libstlport_static.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--start-group -lstlport_static" )
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
 endif()
 else( ANDROID_USE_STLPORT )
 if( EXISTS "${__stlLibPath}/libgnustl_static.a" )
@@ -880,11 +884,6 @@ else( ANDROID_USE_STLPORT )
 elseif( EXISTS "${__stlLibPath}/libstdc++.a" )
  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
 endif()
- if( EXISTS "${__ndklibspath}/libstdc++.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lstdc++" )
- endif()
-
- #gcc exception & rtti support
 if( EXISTS "${__stlLibPath}/libsupc++.a" )
  __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
 elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
@@ -896,16 +895,14 @@ else( ANDROID_USE_STLPORT )
 elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
 endif()
- if( EXISTS "${__ndklibspath}/libsupc++.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lsupc++" )
- endif()
+ list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" )
 endif( ANDROID_USE_STLPORT )

-#cleanup for STL search
+# cleanup for STL search
 unset( __stlIncludePath )
 unset( __stlLibPath )

-#other linker flags
+# other linker flags
 __INIT_VARIABLE( ANDROID_NO_UNDEFINED OBSOLETE_NO_UNDEFINED VALUES ON )
 set( ANDROID_NO_UNDEFINED ${ANDROID_NO_UNDEFINED} CACHE BOOL "Show all undefined symbols as linker errors" FORCE )
 mark_as_advanced( ANDROID_NO_UNDEFINED )
@@ -914,7 +911,7 @@ if( ANDROID_NO_UNDEFINED )
 endif()

 if (ANDROID_NDK MATCHES "-r[56].?$")
- #libGLESv2.so in NDK's prior to r7 refers to exteranal symbols. So this flag option is required for all projects using OpenGL from native.
+ # libGLESv2.so in NDKs prior to r7 refers to external symbols, so this flag is required for all projects using OpenGL from native code.
 __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES ON )
 else()
 __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES OFF )
@@ -940,7 +937,7 @@ if( ARMEABI_V7A )
 set( ANDROID_LINKER_FLAGS "-Wl,--fix-cortex-a8 ${ANDROID_LINKER_FLAGS}" )
 endif()

-#cache flags
+# cache flags
 set( CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags" )
 set( CMAKE_C_FLAGS "${_CMAKE_C_FLAGS}" CACHE STRING "c flags" )
 set( CMAKE_CXX_FLAGS_RELEASE "${_CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "c++ Release flags" )
@@ -954,7 +951,7 @@ set( CMAKE_EXE_LINKER_FLAGS "-Wl,-z,nocopyreloc" CACHE STRING "linker flags" )
 include_directories( SYSTEM ${ANDROID_SYSTEM_INCLUDE_DIRS} )
 link_directories( ${ANDROID_SYSTEM_LIB_DIRS} )

-#finish flags
+# finish flags
 set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS}"    CACHE INTERNAL "Extra Android compiler flags")
 set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}" CACHE INTERNAL "Extra Android linker flags")
 set( CMAKE_CXX_FLAGS           "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
@@ -969,7 +966,7 @@ else()
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()

-#set these global flags for cmake client scripts to change behavior
+# set these global flags for cmake client scripts to change behavior
 set( ANDROID True )
 set( BUILD_ANDROID True )

@@ -982,7 +979,7 @@ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
 set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )


-#macro to find packages on the host OS
+# macro to find packages on the host OS
 macro( find_host_package )
 set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
 set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@@ -1004,7 +1001,7 @@ macro( find_host_package )
 endmacro()


-#macro to find programs on the host OS
+# macro to find programs on the host OS
 macro( find_host_program )
 set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
 set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@@ -1044,7 +1041,13 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
 set( __toolchain_config "")
 foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING __ndklibspath )
  if( DEFINED ${__var} )
-   set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" )\n" )
+   # quote the stored value only when it contains a space; test the value itself
+   # rather than relying on if() implicitly dereferencing the quoted variable name
+   if( "${${__var}}" MATCHES " ")
+    set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" CACHE INTERNAL \"\" )\n" )
+   else()
+    set( __toolchain_config "${__toolchain_config}set( ${__var} ${${__var}} CACHE INTERNAL \"\" )\n" )
+   endif()
  endif()
 endforeach()
 file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )

--- a/doc/_themes/blue/static/default.css_t
+++ b/doc/_themes/blue/static/default.css_t
@@ -175,6 +175,8 @@ a:hover {
 div.body p, div.body dd, div.body li {
    text-align: justify;
    line-height: 130%;
+    margin-top: 1em;
+    margin-bottom: 1em;
 }

 div.body h1,
@@ -327,9 +329,9 @@ table.field-list {
    margin-top: 20px;
 }

-ul.simple {
+/*ul.simple {
    list-style: none;
-}
+}*/

 em.menuselection, em.guilabel {
    font-family: {{ theme_guifont }};
@@ -384,3 +386,8 @@ margin-top: 0px;
 div.body ul.search li {
    text-align: left;
 }
+
+div.linenodiv {
+    min-width: 1em;
+    text-align: right;
+}
\ No newline at end of file
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
+++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
@@ -202,7 +202,7 @@ Open OpenCV library and samples in Eclipse

   Sometimes more advanced manipulations are required:

-   * The provided projects are configured for ``API 11`` target (and ``API 9`` for the library) that can be missing platform in your Android SDK.
+   The provided projects are configured for the ``API 11`` target (and ``API 9`` for the library), which may be a platform missing from your Android SDK.
   After right click on any project select  :guilabel:`Properties` and then :guilabel:`Android` on the left pane.
   Click some target with `API Level` 11 or higher:


--- a/doc/tutorials/introduction/android_binary_package/android_dev_intro.rst
+++ b/doc/tutorials/introduction/android_binary_package/android_dev_intro.rst
@@ -75,7 +75,7 @@ You need the following software to be installed in order to develop for Android

           sudo update-java-alternatives --set java-6-sun

-   **TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.
+..   **TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.

 #. **Android SDK**

@@ -241,6 +241,7 @@ where:
 The script :file:`Android.mk` usually has the following structure:

 .. code-block:: make
+   :linenos:

   LOCAL_PATH := $(call my-dir)

@@ -258,6 +259,7 @@ This is the minimal file :file:`Android.mk`, which builds C++ source code of an
 Usually the file :file:`Application.mk` is optional, but in case of project using OpenCV, when STL and exceptions are used in C++, it also should be created. Example of the file :file:`Application.mk`:

 .. code-block:: make
+   :linenos:

   APP_STL := gnustl_static
   APP_CPPFLAGS := -frtti -fexceptions
@@ -337,7 +339,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
      :alt: Configure CDT
      :align: center

-    ` `
+   And:

   .. image:: images/eclipse_cdt_cfg2.png
      :alt: Configure CDT
@@ -350,6 +352,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
        :align: center

 #. Open :guilabel:`Project Properties -> C/C++ Build`, unckeck ``Use default build command``, replace "Build command" text from ``"make"`` to
+
   ``"${NDKROOT}/ndk-build.cmd"`` on Windows,

   ``"${NDKROOT}/ndk-build"`` on Linux and MacOS.
@@ -393,7 +396,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
      :alt: Configure CDT
      :align: center

-     .. note:: The latest Android NDK **r8b** has a bit different STL headers path. So if you use this NDK version please use the following modified **Include** paths list:
+   .. note:: The latest Android NDK **r8b** uses a different STL headers path. If you use this NDK release, add the following **Include** paths list instead:

   ::

@@ -412,12 +415,16 @@ AVD
 AVD (*Android Virtual Device*) is not probably the most convenient way to test an OpenCV-dependent application, but sure the most uncomplicated one to configure.

 #. Assuming you already have *Android SDK* and *Eclipse IDE* installed, in Eclipse go :guilabel:`Window -> AVD Manager`.
-     **TBD:** how to start AVD Manager without Eclipse...
+
+   ..     **TBD:** how to start AVD Manager without Eclipse...
+
 #. Press the :guilabel:`New` button in :guilabel:`AVD Manager` window.
 #. :guilabel:`Create new Android Virtual Device` window will let you select some properties for your new device, like target API level, size of SD-card and other.
+
   .. image:: images/AVD_create.png
      :alt: Configure builders
      :align: center
+
 #. When you click the :guilabel:`Create AVD` button, your new AVD will be availible in :guilabel:`AVD Manager`.
 #. Press :guilabel:`Start` to launch the device. Be aware that any AVD (a.k.a. Emulator) is usually much slower than a hardware Android device, so it may take up to several minutes to start.
 #. Go :guilabel:`Run -> Run/Debug`  in Eclipse IDE to run your application in regular or debugging mode. :guilabel:`Device Chooser` will let you choose among the running devices or to start a new one.
@@ -435,22 +442,31 @@ Windows host computer
 #. Attach the Android device to your PC with a USB cable.
 #. Go to :guilabel:`Start Menu` and **right-click** on :guilabel:`Computer`. Select :guilabel:`Manage` in the context menu. You may be asked for Administrative permissions.
 #. Select :guilabel:`Device Manager` in the left pane and find an unknown device in the list. You may try unplugging it and then plugging back in order to check whether it's your exact equipment appears in the list.
+
   .. image:: images/usb_device_connect_01.png
      :alt: Unknown device
      :align: center
+
 #. Try your luck installing `Google USB drivers` without any modifications: **right-click** on the unknown device, select :guilabel:`Properties` menu item --> :guilabel:`Details` tab --> :guilabel:`Update Driver` button.
+
   .. image:: images/usb_device_connect_05.png
      :alt: Device properties
      :align: center
+
 #. Select :guilabel:`Browse computer for driver software`.
+
   .. image:: images/usb_device_connect_06.png
      :alt: Browse for driver
      :align: center
+
 #. Specify the path to :file:`<Android SDK folder>/extras/google/usb_driver/` folder.
+
   .. image:: images/usb_device_connect_07.png
      :alt: Browse for driver
      :align: center
+
 #. If you get the prompt to install unverified drivers and report about success - you've finished with USB driver installation.
+
   .. image:: images/usb_device_connect_08.png
      :alt: Install prompt
      :align: center
@@ -460,23 +476,33 @@ Windows host computer
   .. image:: images/usb_device_connect_09.png
      :alt: Installed OK
      :align: center
+
 #. Otherwise (getting the failure like shown below) follow the next steps.
+
   .. image:: images/usb_device_connect_12.png
      :alt: No driver
      :align: center
+
 #. Again **right-click** on the unknown device, select :guilabel:`Properties --> Details --> Hardware Ids` and copy the line like ``USB\VID_XXXX&PID_XXXX&MI_XX``.
+
   .. image:: images/usb_device_connect_02.png
      :alt: Device properties details
      :align: center
+
 #. Now open file :file:`<Android SDK folder>/extras/google/usb_driver/android_winusb.inf`. Select either ``Google.NTx86`` or ``Google.NTamd64`` section depending on your host system architecture.
+
   .. image:: images/usb_device_connect_03.png
      :alt: "android_winusb.inf"
      :align: center
+
 #. There should be a record like existing ones for your device and you need to add one manually.
+
   .. image:: images/usb_device_connect_04.png
      :alt: "android_winusb.inf"
      :align: center
+
 #. Save the :file:`android_winusb.inf` file and try to install the USB driver again.
+
   .. image:: images/usb_device_connect_05.png
      :alt: Device properties
      :align: center
@@ -492,7 +518,9 @@ Windows host computer
   .. image:: images/usb_device_connect_07.png
      :alt: Browse for driver
      :align: center
+
 #. This time installation should go successfully.
+
   .. image:: images/usb_device_connect_08.png
      :alt: Install prompt
      :align: center
@@ -502,11 +530,15 @@ Windows host computer
   .. image:: images/usb_device_connect_09.png
      :alt: Installed OK
      :align: center
+
 #. And an unknown device is now recognized as an Android phone.
+
   .. image:: images/usb_device_connect_10.png
      :alt: "Known" device
      :align: center
+
 #. Successful device USB connection can be verified in console via ``adb devices`` command.
+
   .. image:: images/usb_device_connect_11.png
      :alt: "adb devices"
      :align: center
@@ -523,7 +555,7 @@ By default Linux doesn't recognize Android devices, but it's easy to fix this is

 Then restart your adb server (even better to restart the system), plug in your Android device and execute :command:`adb devices` command. You will see the list of attached devices:

-  .. image:: images/usb_device_connect_ubuntu.png
+.. image:: images/usb_device_connect_ubuntu.png
   :alt: List of attached devices
   :align: center


--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
@@ -57,7 +57,7 @@ Using async initialization is a **recommended** way for application development.
 To run OpenCV Manager-based application the first time you need to install packages with the `OpenCV Manager` and `OpenCV binary pack` for you platform.
 You can do it using Google Play Market or manually with ``adb`` tool:

-  .. code-block:: sh
+.. code-block:: sh
    :linenos:

    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_Manager.apk
@@ -266,12 +266,12 @@ It will be capable of accessing camera output, processing it and displaying the
 #. Set name, target, package and minSDKVersion accordingly.

 #. Create a new class (*File -> New -> Class*). Name it for example: *HelloOpenCVView*.
+
   .. image:: images/dev_OCV_new_class.png
        :alt: Add a new class.
        :align: center

   * It should extend *SurfaceView* class.
-
   * It also should implement *SurfaceHolder.Callback*, *Runnable*.

 #. Edit *HelloOpenCVView* class.
@@ -279,7 +279,9 @@ It will be capable of accessing camera output, processing it and displaying the
   * Add an *import* line for *android.content.context*.

   * Modify autogenerated stubs: *HelloOpenCVView*, *surfaceCreated*, *surfaceDestroyed* and *surfaceChanged*.
+
     .. code-block:: java
+        :linenos:

        package com.hello.opencv.test;

@@ -300,16 +302,18 @@ It will be capable of accessing camera output, processing it and displaying the
            cameraRelease();
        }

-		  public void surfaceChanged(SurfaceHolder holder, int format, int width,
-		  int height) {
+        public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
            cameraSetup(width, height);
        }

+        //...
+
   * Add *cameraOpen*, *cameraRelease* and *cameraSetup* voids as shown below.

   * Also, don't forget to add the public void *run()* as follows:

     .. code-block:: java
+        :linenos:

        public void run() {
            // TODO: loop { getFrame(), processFrame(), drawFrame() }
@@ -327,11 +331,10 @@ It will be capable of accessing camera output, processing it and displaying the
            // TODO setup camera
        }

-
-       ..
-
 #. Create a new *Activity* (*New -> Other -> Android -> Android Activity*) and name it, for example: *HelloOpenCVActivity*. For this activity define *onCreate*, *onResume* and *onPause* voids.
+
   .. code-block:: java
+      :linenos:

       public void onCreate (Bundle savedInstanceState) {
           super.onCreate(savedInstanceState);
@@ -359,11 +362,12 @@ It will be capable of accessing camera output, processing it and displaying the
               });
               ad.show();
           }
-		
       }

 #. Add the following permissions to the AndroidManifest.xml file:
+
   .. code-block:: xml
+      :linenos:

      </application>

@@ -372,12 +376,15 @@ It will be capable of accessing camera output, processing it and displaying the
      <uses-feature android:name="android.hardware.camera.autofocus" />

 #. Reference OpenCV library within your project properties.
+
   .. image:: images/dev_OCV_reference.png
        :alt: Reference OpenCV library.
        :align: center

 #. We now need some code to handle the camera. Update the *HelloOpenCVView* class as follows:
+
   .. code-block:: java
+      :linenos:

      private VideoCapture      mCamera;

@@ -394,6 +401,7 @@ It will be capable of accessing camera output, processing it and displaying the
          }
          return true;
      }
+
      public void cameraRelease() {
          synchronized(this) {
              if (mCamera != null) {
@@ -402,6 +410,7 @@ It will be capable of accessing camera output, processing it and displaying the
              }
          }
      }
+
      private void cameraSetup(int width, int height) {
          synchronized (this) {
              if (mCamera != null && mCamera.isOpened()) {
@@ -425,7 +434,9 @@ It will be capable of accessing camera output, processing it and displaying the
      }

 #. The last step would be to update the *run()* void in *HelloOpenCVView* class as follows:
+
   .. code-block:: java
+      :linenos:

      public void run() {
          while (true) {
@@ -465,5 +476,3 @@ It will be capable of accessing camera output, processing it and displaying the
          }
          return bmp;
      }
-
-
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -4620,6 +4620,51 @@ public:

 CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);

+/////////////////////////// Synchronization Primitives ///////////////////////////////
+
+/*
+  Handle to a platform lock object (Mutex::Impl, defined in system.cpp).
+
+  Copying or assigning a Mutex does not create a second lock: the implementation
+  in system.cpp makes the copy share the source's Impl and tracks the number of
+  handles in Impl::refcount, deleting the Impl when the last handle is destroyed.
+*/
+class CV_EXPORTS Mutex
+{
+public:
+    Mutex();
+    ~Mutex();
+    Mutex(const Mutex& m);
+    Mutex& operator = (const Mutex& m);
+
+    void lock();
+    // returns true when the lock was acquired, false otherwise
+    bool trylock();
+    void unlock();
+
+    struct Impl;
+protected:
+    Impl* impl;
+};
+
+/*
+  Scoped lock: the constructor locks the given Mutex and the destructor unlocks it.
+  The Mutex is referenced by pointer, so it must outlive the AutoLock.
+*/
+class CV_EXPORTS AutoLock
+{
+public:
+    AutoLock(Mutex& m) : mutex(&m) { mutex->lock(); }
+    ~AutoLock() { mutex->unlock(); }
+protected:
+    Mutex* mutex;
+private:
+    // Declared but never defined: a copied AutoLock would unlock the same
+    // Mutex a second time when the copy is destroyed.
+    AutoLock(const AutoLock&);
+    AutoLock& operator = (const AutoLock&);
+};
+
 }

 #endif // __cplusplus

--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -42,6 +42,16 @@

 #include "precomp.hpp"

+#if !defined HAVE_TBB && !defined HAVE_OPENMP && !defined HAVE_GCD && !defined HAVE_CONCURRENCY
+
+#ifdef __APPLE__
+#define HAVE_GCD
+#elif defined _MSC_VER && _MSC_VER >= 1600
+#define HAVE_CONCURRENCY
+#endif
+
+#endif
+
 #ifdef HAVE_CONCURRENCY
 #  include <ppl.h>
 #elif defined HAVE_OPENMP

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -930,4 +930,125 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
 }
 #endif

+// OSSpinLock declarations for the __APPLE__ variant of Mutex::Impl below.
+// Included here, at global scope, so that the system header is not pulled
+// into namespace cv.
+#if !defined WIN32 && !defined _WIN32 && !defined WINCE && defined __APPLE__
+#include <libkern/OSAtomic.h>
+#endif
+
+namespace cv
+{
+
+// Mutex::Impl holds one platform lock object and the number of Mutex handles
+// that share it (refcount, 1 on construction). Exactly one variant is built.
+
+#if defined WIN32 || defined _WIN32 || defined WINCE
+
+// Windows: CRITICAL_SECTION
+struct Mutex::Impl
+{
+    Impl() { InitializeCriticalSection(&cs); refcount = 1; }
+    ~Impl() { DeleteCriticalSection(&cs); }
+
+    void lock() { EnterCriticalSection(&cs); }
+    bool trylock() { return TryEnterCriticalSection(&cs) != 0; }
+    void unlock() { LeaveCriticalSection(&cs); }
+
+    CRITICAL_SECTION cs;
+    int refcount;
+};
+
+#elif defined __APPLE__
+
+// Apple platforms: OSSpinLock
+struct Mutex::Impl
+{
+    Impl() { sl = OS_SPINLOCK_INIT; refcount = 1; }
+    ~Impl() {}
+
+    void lock() { OSSpinLockLock(&sl); }
+    bool trylock() { return OSSpinLockTry(&sl); }
+    void unlock() { OSSpinLockUnlock(&sl); }
+
+    OSSpinLock sl;
+    int refcount;
+};
+
+#elif defined __linux__ && !defined ANDROID
+
+// Linux other than Android: pthread spin lock
+struct Mutex::Impl
+{
+    Impl() { pthread_spin_init(&sl, 0); refcount = 1; }
+    ~Impl() { pthread_spin_destroy(&sl); }
+
+    void lock() { pthread_spin_lock(&sl); }
+    bool trylock() { return pthread_spin_trylock(&sl) == 0; }
+    void unlock() { pthread_spin_unlock(&sl); }
+
+    pthread_spinlock_t sl;
+    int refcount;
+};
+
+#else
+
+// Remaining platforms (Android included): pthread mutex with default attributes
+struct Mutex::Impl
+{
+    Impl() { pthread_mutex_init(&sl, 0); refcount = 1; }
+    ~Impl() { pthread_mutex_destroy(&sl); }
+
+    void lock() { pthread_mutex_lock(&sl); }
+    bool trylock() { return pthread_mutex_trylock(&sl) == 0; }
+    void unlock() { pthread_mutex_unlock(&sl); }
+
+    pthread_mutex_t sl;
+    int refcount;
+};
+
+#endif
+
+// Creates a new Impl referenced by this handle only.
+Mutex::Mutex()
+{
+    impl = new Mutex::Impl;
+}
+
+// Releases this handle's reference. The "== 1" test relies on CV_XADD
+// yielding the counter value from before the decrement, i.e. this was the
+// last handle and the Impl can be deleted.
+Mutex::~Mutex()
+{
+    if( CV_XADD(&impl->refcount, -1) == 1 )
+        delete impl;
+    impl = 0;
+}
+
+// Shares m's Impl instead of creating a new lock.
+Mutex::Mutex(const Mutex& m)
+{
+    impl = m.impl;
+    CV_XADD(&impl->refcount, 1);
+}
+
+// Rebinds this handle to m's Impl. The reference on m's Impl is taken
+// before the current one is released, so self-assignment never deletes
+// the Impl that is about to be reused.
+Mutex& Mutex::operator = (const Mutex& m)
+{
+    CV_XADD(&m.impl->refcount, 1);
+    if( CV_XADD(&impl->refcount, -1) == 1 )
+        delete impl;
+    impl = m.impl;
+    return *this;
+}
+
+// Forward to the platform-specific Impl.
+void Mutex::lock() { impl->lock(); }
+void Mutex::unlock() { impl->unlock(); }
+bool Mutex::trylock() { return impl->trylock(); }
+
+}
+
 /* End of file. */
\ No newline at end of file
--- a/modules/gpu/doc/video.rst
+++ b/modules/gpu/doc/video.rst
@@ -653,7 +653,7 @@ gpu::GMG_GPU
 ------------
 .. ocv:class:: gpu::GMG_GPU

-Class used for background/foreground segmentation. ::
+  Class used for background/foreground segmentation. ::

    class GMG_GPU_GPU
    {
@@ -677,9 +677,9 @@ Class used for background/foreground segmentation. ::
        ...
    };

-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [GMG2012]_.
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [GMG2012]_.

-Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+  Here are important members of the class that control the algorithm, which you can set after constructing the class instance:

    .. ocv:member:: int maxFeatures


--- a/modules/gpu/misc/mark_nvidia.py
+++ b/modules/gpu/misc/mark_nvidia.py
 import sys, re

 spaces = '[\s]*'
-symbols = '[\s\w\d,=:|]*'
+symbols = '[\s\w\d,.=:|]*'

 def pattern1(prefix, test):
    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + '\)' + spaces)

--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -4,7 +4,7 @@ if(NOT HAVE_OPENCL)
 endif()

 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
+ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_ts)

 ocv_module_include_directories()


--- a/modules/ocl/perf/test_filters.cpp
+++ b/modules/ocl/perf/test_filters.cpp
@@ -325,7 +325,7 @@ PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
 		ksize = GET_PARAM(1);

 		cv::RNG& rng = TS::ptr()->get_rng();
-		cv::Size size = cv::Size(2560, 2560);
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);

 		mat  = randomMat(rng, size, type, 5, 16, false);
 		dst  = randomMat(rng, size, type, 5, 16, false);
@@ -468,7 +468,7 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
 		//  iterations = GET_PARAM(1);

 		cv::RNG& rng = TS::ptr()->get_rng();
-		cv::Size size = cv::Size(2560, 2560);
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);

 		mat1 = randomMat(rng, size, type, 5, 16, false);
 		dst  = randomMat(rng, size, type, 5, 16, false);
@@ -679,7 +679,7 @@ PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
 		dx = 2; dy=0;

 		cv::RNG& rng = TS::ptr()->get_rng();
-		cv::Size size = cv::Size(2560, 2560);
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);

 		mat1 = randomMat(rng, size, type, 5, 16, false);
 		dst  = randomMat(rng, size, type, 5, 16, false);
@@ -817,7 +817,7 @@ PARAM_TEST_CASE(Scharr, MatType, int, int, int)
 		dx = 1; dy=0;

 		cv::RNG& rng = TS::ptr()->get_rng();
-		cv::Size size = cv::Size(2560, 2560);
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);

 		mat1 = randomMat(rng, size, type, 5, 16, false);
 		dst  = randomMat(rng, size, type, 5, 16, false);
@@ -956,7 +956,7 @@ PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
 		bordertype = GET_PARAM(2);

 		cv::RNG& rng = TS::ptr()->get_rng();
-		cv::Size size = cv::Size(2560, 2560);
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);

 		sigma1 = rng.uniform(0.1, 1.0); 
 		sigma2 = rng.uniform(0.1, 1.0);

--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -177,9 +177,7 @@ namespace cv
        extern const char *filter_sep_row;
        extern const char *filter_sep_col;
        extern const char *filtering_laplacian;
-        extern const char *filtering_erodeFilter;
-        extern const char *filtering_dilateFilter;
-
+        extern const char *filtering_morph;
    }
 }

@@ -334,28 +332,54 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c

    int srcStep = src.step1() / src.channels();
    int dstStep = dst.step1() / dst.channels();
-    int srcOffset = src.offset / src.channels() / src.elemSize1();
-    int dstOffset = dst.offset / dst.channels() / dst.elemSize1();
-    int minclos = -(srcOffset % srcStep);
-    int maxclos = src.wholecols + minclos - 1;
-    int minrows = -(srcOffset / srcStep);
-    int maxrows = src.wholerows + minrows - 1;
-
-    //int D=src.depth();
+    int srcOffset = src.offset /  src.elemSize();
+    int dstOffset = dst.offset /  dst.elemSize();

+    int srcOffset_x=srcOffset%srcStep;
+	int srcOffset_y=srcOffset/srcStep;
    Context *clCxt = src.clCxt;
+	string kernelName;
+    size_t localThreads[3] = {16, 16, 1};
+    size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1};
      
-    string kernelName = "erode";
-
+	if(src.type()==CV_8UC1)
+	{
+		kernelName = "morph_C1_D0";
+		globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
+		CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) );
+	}
+	else
+	{
+		kernelName = "morph";
+		CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) );
+	}
+	char s[64];
+	switch(src.type())
+	{
+	case CV_8UC1:
+		sprintf(s, "-D VAL=255");
+		break;
+	case CV_8UC3:
+	case CV_8UC4:
+		sprintf(s, "-D VAL=255 -D GENTYPE=uchar4");
+		break;
+	case CV_32FC1:
+		sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float");
+		break;
+	case CV_32FC3:
+	case CV_32FC4:
+		sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float4");
+		break;
+	default:
+		CV_Error(-217,"unsupported type");
+	}
+    char compile_option[128]; // localThreads[] is size_t: cast so the arguments match the "%d" conversions
+    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], s);
    vector< pair<size_t, const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&minclos));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&minrows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
@@ -363,18 +387,8 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
 	args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols));
 	args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows));
-    //args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.width));
-    //args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.height));
-
-    size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
-    if(src.channels() == 1)
-        globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16;
-    size_t localThreads[3] = {16, 16, 1};
-
-    char compile_option[128];
-    sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height);
-
-    openCLExecuteKernel(clCxt, &filtering_erodeFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option);
+    args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset));
+    openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 }


@@ -390,26 +404,54 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,

    int srcStep = src.step1() / src.channels();
    int dstStep = dst.step1() / dst.channels();
-    int srcOffset = src.offset / src.channels() / src.elemSize1();
-    int dstOffset = dst.offset / dst.channels() / dst.elemSize1();
-    int minclos = -(srcOffset % srcStep);
-    int maxclos = src.wholecols + minclos - 1;
-    int minrows = -(srcOffset / srcStep);
-    int maxrows = src.wholerows + minrows - 1;
-
+    int srcOffset = src.offset /  src.elemSize();
+    int dstOffset = dst.offset /  dst.elemSize();

+    int srcOffset_x=srcOffset%srcStep;
+	int srcOffset_y=srcOffset/srcStep;
    Context *clCxt = src.clCxt;
+	string kernelName;
+    size_t localThreads[3] = {16, 16, 1};
+    size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1};
      
-    string kernelName = "dilate";
+	if(src.type()==CV_8UC1)
+	{
+		kernelName = "morph_C1_D0";
+		globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
+		CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) );
+	}
+	else
+	{
+		kernelName = "morph";
+		CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) );
+	}
+	char s[64];
+	switch(src.type())
+	{
+	case CV_8UC1:
+		sprintf(s, "-D VAL=0");
+		break;
+	case CV_8UC3:
+	case CV_8UC4:
+		sprintf(s, "-D VAL=0 -D GENTYPE=uchar4");
+		break;
+	case CV_32FC1:
+		sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float");
+		break;
+	case CV_32FC3:
+	case CV_32FC4:
+		sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float4");
+		break;
+	default:
+		CV_Error(-217,"unsupported type");
+	}
+    char compile_option[128]; // localThreads[] is size_t: cast so the arguments match the "%d" conversions
+    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], s);
    vector< pair<size_t, const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&minclos));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&minrows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
@@ -417,15 +459,8 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize,
    args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data));
 	args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols));
 	args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows));
-
-    size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
-    if(src.channels() == 1)
-        globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16;
-    size_t localThreads[3] = {16, 16, 1};
-    char compile_option[128];
-    sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height);
-
-    openCLExecuteKernel(clCxt, &filtering_dilateFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option);
+    args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset));
+    openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 }

 Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor)
@@ -739,7 +774,7 @@ namespace
            int src_type = src.type();

            int cn = src.channels();
-            dst.create(src_size, src_type);
+            //dst.create(src_size, src_type);
            dst = Scalar(0.0);
            //dstBuf.create(src_size, src_type);
            dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
@@ -1265,8 +1300,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
        sprintf(btype, "BORDER_REFLECT_101");
        break;
    }
-    char compile_option[128];
-    sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype);
+    char compile_option[256];
+    

    size_t globalThreads[3];
    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
@@ -1277,21 +1312,46 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
        {
        case 1:
            globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float","uchar","convert_uchar_sat");
            break;
        case 2:
            globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float2","uchar2","convert_uchar2_sat");
            break;
        case 3:
-            globalThreads[0] = ((dst.cols * 3 + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-            break;
        case 4:
            globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float4","uchar4","convert_uchar4_sat");
            break;
        }
    }
    else
    {
        globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
+		switch(dst.type())
+		{
+		case CV_32SC1:
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float","int","convert_int_sat");
+			break;
+		case CV_32SC3:
+		case CV_32SC4:
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float4","int4","convert_int4_sat");
+			break;
+		case CV_32FC1:
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float","float","");
+			break;
+		case CV_32FC3:
+		case CV_32FC4:
+			sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+				anchor, localThreads[0], localThreads[1], channels, btype,"float4","float4","");
+			break;
+		}
    }

    //sanity checks
@@ -1321,7 +1381,7 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));

-    openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, channels, dst.depth(), compile_option);
+    openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 }

 Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor, int bordertype, double delta)
@@ -1376,7 +1436,7 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
 {
    if( ddepth < 0 )
        ddepth = src.depth();
-    CV_Assert(ddepth == src.depth());
+    //CV_Assert(ddepth == src.depth());
    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));

    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);

--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -394,8 +394,15 @@ namespace cv
                args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols));
                args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows));
                args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
+                if(src.clCxt -> impl -> double_support != 0)
+                {
                    args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
                }
+                else
+                {
+                    args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue));
+                }
+              }
            openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth());
    }	
    

--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -44,9 +44,9 @@
 //M*/

 #include "precomp.hpp"
-#include "threadsafe.h"
+#include "Threadsafe.h"
 #include <iomanip>
-#include "binarycaching.hpp"
+#include "binaryCaching.hpp"

 using namespace cv;
 using namespace cv::ocl;

--- a/modules/ocl/src/kernels/filter_sep_col.cl
+++ b/modules/ocl/src/kernels/filter_sep_col.cl
@@ -90,9 +90,9 @@ Niko
 ***********************************************************************************/


-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D0
-						(__global const float * restrict src, 
-						 __global uchar * dst,
+__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
+						(__global const GENTYPE_SRC * restrict src, 
+						 __global GENTYPE_DST * dst,
                         const int dst_cols,
                         const int dst_rows, 
 						 const int src_whole_cols,
@@ -111,10 +111,10 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
 	int start_addr = mad24(y,src_step_in_pixel,x);
 	int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
 	int i;
-	float sum;
-	float temp[READ_TIMES_COL];
+	GENTYPE_SRC sum;
+	GENTYPE_SRC temp[READ_TIMES_COL];

-	__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
+	__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];

 	//read pixels from src
 	for(i = 0;i<READ_TIMES_COL;i++)
@@ -141,170 +141,6 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
 	if((x<dst_cols) & (y<dst_rows))
 	{
 		start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
-		dst[start_addr] = convert_uchar_sat(sum);
-	}
-}
-
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D0
-						(__global const float4 * restrict src, 
-						 __global uchar4 * dst,
-                         const int dst_cols,
-                         const int dst_rows, 
-						 const int src_whole_cols,
-						 const int src_whole_rows,
-                         const int src_step_in_pixel, 
-                         //const int src_offset_x, 
-                         //const int src_offset_y, 
-                         const int dst_step_in_pixel,
-                         const int dst_offset_in_pixel,
-                         __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-	int l_x = get_local_id(0);
-	int l_y = get_local_id(1);
-	int start_addr = mad24(y,src_step_in_pixel,x);
-	int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-	int i;
-	float4 sum;
-	float4 temp[READ_TIMES_COL];
-
-	__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
-
-	//read pixels from src
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
-		current_addr = current_addr < end_addr ? current_addr : 0;
-		temp[i] = src[current_addr];
-	}
-	//save pixels to lds
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-	//read pixels from lds and calculate the result
-	sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
-	for(i=1;i<=RADIUSY;i++)
-	{
-		temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
-		temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
-		sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
-	}
-	//write the result to dst
-	if((x<dst_cols) & (y<dst_rows))
-	{
-		start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
-		dst[start_addr] = convert_uchar4_sat(sum);
-	}
-}
-
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D5
-						(__global const float * restrict src, 
-						 __global float * dst,
-                         const int dst_cols,
-                         const int dst_rows, 
-						 const int src_whole_cols,
-						 const int src_whole_rows,
-                         const int src_step_in_pixel, 
-                         //const int src_offset_x, 
-                         //const int src_offset_y, 
-                         const int dst_step_in_pixel,
-                         const int dst_offset_in_pixel,
-                         __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-	int l_x = get_local_id(0);
-	int l_y = get_local_id(1);
-	int start_addr = mad24(y,src_step_in_pixel,x);
-	int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-	int i;
-	float sum;
-	float temp[READ_TIMES_COL];
-
-	__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
-
-	//read pixels from src
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
-		current_addr = current_addr < end_addr ? current_addr : 0;
-		temp[i] = src[current_addr];
-	}
-	//save pixels to lds
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-	//read pixels from lds and calculate the result
-	sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
-	for(i=1;i<=RADIUSY;i++)
-	{
-		temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
-		temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
-		sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
-	}
-	//write the result to dst
-	if((x<dst_cols) & (y<dst_rows))
-	{
-		start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
-		dst[start_addr] = sum;
-	}
-}
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D5
-						(__global const float4 * restrict src, 
-						 __global float4 * dst,
-                         const int dst_cols,
-                         const int dst_rows, 
-						 const int src_whole_cols,
-						 const int src_whole_rows,
-                         const int src_step_in_pixel, 
-                         //const int src_offset_x, 
-                         //const int src_offset_y, 
-                         const int dst_step_in_pixel,
-                         const int dst_offset_in_pixel,
-                         __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-	int l_x = get_local_id(0);
-	int l_y = get_local_id(1);
-	int start_addr = mad24(y,src_step_in_pixel,x);
-	int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-	int i;
-	float4 sum;
-	float4 temp[READ_TIMES_COL];
-
-	__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
-
-	//read pixels from src
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
-		current_addr = current_addr < end_addr ? current_addr : 0;
-		temp[i] = src[current_addr];
-	}
-	//save pixels to lds
-	for(i = 0;i<READ_TIMES_COL;i++)
-	{
-		LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-	//read pixels from lds and calculate the result
-	sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
-	for(i=1;i<=RADIUSY;i++)
-	{
-		temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
-		temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
-		sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
-	}
-	//write the result to dst
-	if((x<dst_cols) & (y<dst_rows))
-	{
-		start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
-		dst[start_addr] = sum;
+		dst[start_addr] = convert_to_DST(sum);
 	}
 }
--- a/modules/ocl/src/kernels/filtering_dilateFilter.cl
+++ b/modules/ocl/src/kernels/filtering_dilateFilter.cl
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other GpuMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL FP_CONTRACT ON
-#define UCHAR_MIN 0
-__kernel void dilate_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = get_global_id(0);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    float4 maxVal = (float4)(-FLT_MAX);
-	  int k=0;
-	  for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX; j++, kX++)
-        {
-			int current_addr = mad24(kY,srcStep,kX) + srcOffset;
-			current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-			float4 v = src[current_addr];		
-			uchar now = mat_kernel[k++];
-		    float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(-FLT_MAX);
-            maxVal = max(maxVal , flag);
-        }
-    }
-
-	  if(mX < cols && mY < rows)
-        dst[mY * dstStep + mX + dstOffset] = (maxVal);		   
-}
-
-__kernel void dilate_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = (get_global_id(0)<<2) - (dstOffset&3);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    float4 maxVal = (float4)(-FLT_MAX);
-	  int k=0;
-	  for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			int start = mad24(kY,srcStep,kX) + srcOffset;
-			start = ((start < end_addr) && (start > 0)) ? start : 0;
-			int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
-			    float8 sVal = (float8)(src[start>>2], src[start2>>2]);
-			
-		    	float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
-		    	int det = start & 3;
-	    		float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);		
-			    uchar now = mat_kernel[k++];
-			    float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
-			    flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : -FLT_MAX;
-			    flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : -FLT_MAX;
-			    flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : -FLT_MAX;
-			    flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : -FLT_MAX;
-			
-          maxVal = max(maxVal , flag);
-        }
-    }
-    if(mY < rows && mX < cols)
-	  {
-		    __global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
-		    float4 dVal = *d;
-    		maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
-    		maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
-    		maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
-    		maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
-		
-        *d = (maxVal);	
-	  }
-}
-
-__kernel void dilate_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = (get_global_id(0)<<2) - (dstOffset&3);;
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    uchar4 maxVal = (uchar4)(UCHAR_MIN);
-	  int k=0;
-	  for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			    int start = mad24(kY,srcStep,kX) + srcOffset;
-				start = ((start < end_addr) && (start > 0)) ? start : 0;
-				int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
-			    uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
-			
-			    uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
-			    int det = start & 3;
-			    uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
-
-			    uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : maxVal;
-			    flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : UCHAR_MIN;
-			    flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : UCHAR_MIN;
-			    flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : UCHAR_MIN;
-			    flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : UCHAR_MIN;			
-
-          maxVal = max(maxVal , flag);
-        }
-    }
-	  if(mY < rows)
-	  {
-		    __global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
-		    uchar4 dVal = *d;
-		
-    		maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
-    		maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
-    		maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
-    		maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
-		
-        *d = (maxVal);	
-	  }
-}
-
-__kernel void dilate_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = get_global_id(0);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    uchar4 maxVal = (uchar4)(UCHAR_MIN);
-	  int k=0;
-	  for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			int current_addr = mad24(kY,srcStep,kX) + srcOffset;
-			current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;		
-			uchar4 v = src[current_addr];
-			    uchar now = mat_kernel[k++];
-			    uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
-          maxVal = max(maxVal , flag);
-        }
-    }
-
-	  if(mX < cols && mY < rows)
-        dst[mY * dstStep + mX + dstOffset] = (maxVal);		   
-}
-
--- a/modules/ocl/src/kernels/filtering_erodeFilter.cl
+++ b/modules/ocl/src/kernels/filtering_erodeFilter.cl
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Zero Lin, zero.lin@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-__kernel void erode_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = get_global_id(0);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    float4 minVal = (float4)(3.4e+38);
-	int k=0;
-	for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX; j++, kX++)
-        {
-			int current_addr = mad24(kY,srcStep,kX) + srcOffset;
-			current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-			float4 v = src[current_addr];
-			uchar now = mat_kernel[k++];
-			float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
-            minVal = min(minVal , flag);
-        }
-    }
-
-	if(mX < cols && mY < rows)
-        dst[mY * dstStep + mX + dstOffset] = (minVal);		   
-}
-
-__kernel void erode_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = (get_global_id(0)<<2) - (dstOffset&3);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    float4 minVal = (float4)(3.4e+38);
-	int k=0;
-	for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			int start = mad24(kY,srcStep,kX) + srcOffset;
-			start = ((start < end_addr) && (start > 0)) ? start : 0;
-			int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
-			float8 sVal = (float8)(src[start>>2], src[start2>>2]);
-			
-			float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
-			int det = start & 3;
-			float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);		
-			uchar now = mat_kernel[k++];
-			float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
-			flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 3.4e+38;
-			flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 3.4e+38;
-			flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 3.4e+38;
-			flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 3.4e+38;
-			
-            minVal = min(minVal , flag);
-        }
-    }
-
-	if(mY < rows && mX < cols)
-	{
-		__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
-		float4 dVal = *d;
-		minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
-		minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
-		minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
-		minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
-		
-        *d = (minVal);	
-	}
-}
-
-__kernel void erode_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = (get_global_id(0)<<2) - (dstOffset&3);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    uchar4 minVal = (uchar4)(0xff);
-	int k=0;
-	for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			int start = mad24(kY,srcStep,kX) + srcOffset;
-			start = ((start < end_addr) && (start > 0)) ? start : 0;
-			int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
-			uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
-			
-			uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
-			int det = start & 3;
-			uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
-
-			uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : (uchar4)(0xff);
-			flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 0xff;
-			flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 0xff;
-			flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 0xff;
-			flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 0xff;			
-
-            minVal = min(minVal , flag);
-        }
-    }
-
-	if(mY < rows)
-	{
-		__global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
-		uchar4 dVal = *d;
-		
-		minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
-		minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
-		minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
-		minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
-		
-        *d = (minVal);	
-	}
-}
-
-__kernel void erode_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset, 
-					int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, 
-					int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
-{
-    int mX = get_global_id(0);
-    int mY = get_global_id(1);
-    int kX = mX - anX, kY = mY - anY;
-	int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
-    uchar4 minVal = (uchar4)(0xff);
-	int k=0;
-	for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
-    {
-        for(int j=0;j<ksX;j++, kX++)
-        {
-			int current_addr = mad24(kY,srcStep,kX) + srcOffset;
-			current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;		
-			uchar4 v = src[current_addr];
-			uchar now = mat_kernel[k++];
-			uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (uchar4)(0xff);
-            minVal = min(minVal , flag);
-        }
-    }
-
-	if(mX < cols && mY < rows)
-        dst[mY * dstStep + mX + dstOffset] = (minVal);		   
-}
-
--- a/modules/ocl/src/kernels/filtering_morph.cl
+++ b/modules/ocl/src/kernels/filtering_morph.cl
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Zero Lin, zero.lin@amd.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+
+// MORPH_OP is the reduction applied over the structuring element. It is
+// selected by the build options: ERODE takes the minimum, DILATE the maximum.
+#ifdef ERODE
+#define MORPH_OP(A,B) min((A),(B))
+#endif
+#ifdef DILATE
+#define MORPH_OP(A,B) max((A),(B))
+#endif
+//BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii
+// ELEM yields elem1 when index i lies outside [l_edge, r_edge) and elem2
+// otherwise. The whole expansion is parenthesized so the conditional cannot
+// be re-associated with operators surrounding the macro call.
+#define ELEM(i,l_edge,r_edge,elem1,elem2) (((i)<(l_edge) | (i) >= (r_edge)) ? (elem1) : (elem2))
+#ifndef GENTYPE
+// Morphology (erode/dilate, chosen through MORPH_OP) for uchar data where
+// every work-item produces four horizontally adjacent output pixels.
+//
+// Macros expected from the build options (not defined in this file):
+//   LSIZE0, LSIZE1   - multiplied with the group ids to get the group origin
+//   RADIUSX, RADIUSY - half sizes of the structuring element, which spans
+//                      (2*RADIUSX+1) x (2*RADIUSY+1) entries
+//   VAL              - value substituted for pixels outside the source image
+//
+// Arguments:
+//   src, dst                        - source / destination pixel buffers
+//   src_offset_x, src_offset_y      - added to the group origin to obtain
+//                                     coordinates in the whole source image
+//   cols, rows                      - extent of the area written to dst
+//   src_step_in_pixel,
+//   dst_step_in_pixel               - row pitch of src / dst, in pixels
+//   mat_kernel                      - structuring element, row-major;
+//                                     a non-zero entry enables that tap
+//   src_whole_cols, src_whole_rows  - bounds used for the border test
+//   dst_offset_in_pixel             - added to every output address
+__kernel void morph_C1_D0(__global const uchar * restrict src,
+						  __global uchar *dst, 
+						  int src_offset_x, int src_offset_y, 
+						  int cols, int rows, 
+					      int src_step_in_pixel, int dst_step_in_pixel, 
+						  __constant uchar * mat_kernel, 
+						  int src_whole_cols, int src_whole_rows,
+						  int dst_offset_in_pixel)
+{
+	int l_x = get_local_id(0);
+	int l_y = get_local_id(1);
+	int x = get_group_id(0)*4*LSIZE0;
+	int y = get_group_id(1)*LSIZE1;
+	// Horizontal tile bounds are rounded down to a multiple of 4 so that the
+	// source can be fetched as uchar4 vectors.
+	int start_x = (x+src_offset_x-RADIUSX) & 0xfffffffc;
+	int end_x = (x + src_offset_x+LSIZE0*4+RADIUSX) & 0xfffffffc;
+	// Tile width counted in uchar4 elements.
+	int width = (end_x -start_x+4)>>2;
+	// Byte shift of the first needed pixel inside its uchar4.
+	int offset = (src_offset_x-RADIUSX) & 3;
+	int start_y = y+src_offset_y-RADIUSY;
+	// Each work-item loads two uchar4 elements of the tile: point1 and point2.
+	int point1 = mad24(l_y,LSIZE0,l_x);
+	int point2 = point1 + LSIZE0*LSIZE1;
+	int tl_x = (point1 % width)<<2;
+	int tl_y = point1 / width;
+	int tl_x2 = (point2 % width)<<2;
+	int tl_y2 = point2 / width;
+	int cur_x = start_x + tl_x;
+	int cur_y = start_y + tl_y;
+	int cur_x2 = start_x + tl_x2;
+	int cur_y2 = start_y + tl_y2;
+	int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
+	int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
+	uchar4 temp0,temp1;
+	__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
+
+	int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
+	//read pixels from src
+	// Addresses outside (0, end_addr) are redirected to 0 so the load stays
+	// inside the buffer; the value read there is replaced by VAL below.
+	start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
+	start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
+	temp0 = *(__global uchar4*)&src[start_addr];
+	temp1 = *(__global uchar4*)&src[start_addr2];
+	//judge if read out of boundary
+	temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
+	temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
+	temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
+	temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
+	temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
+
+	temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
+	temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
+	temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
+	temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
+	temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
+
+	LDS_DAT[point1] = temp0;
+	LDS_DAT[point2] = temp1;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uchar4 res = (uchar4)VAL;
+	for(int i=0;i<2*RADIUSY+1;i++)
+		for(int j=0;j<2*RADIUSX+1;j++)
+		{
+			// Unaligned 4-pixel read from the tile, shifted by the tap column j.
+			res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
+		}
+	int gidx = get_global_id(0)<<2;
+	int gidy = get_global_id(1);
+	int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
+	if(gidy<rows)
+	{
+		// The previous test `dst_offset_in_pixel&3==0` parsed as
+		// `dst_offset_in_pixel & (3==0)` and was therefore always false, which
+		// left the vector store unreachable. Test the alignment of the actual
+		// store address instead; a uchar4 store needs a 4-byte aligned address.
+		if(gidx+3<cols && ((out_addr&3)==0))
+		{
+			*(__global uchar4*)&dst[out_addr] = res;
+		}
+		else
+		{
+			// Scalar fallback: write only the components that lie inside the row.
+			if(gidx<cols)
+				dst[out_addr] = res.x;
+			if(gidx+1<cols)
+				dst[out_addr+1] = res.y;
+			if(gidx+2<cols)
+				dst[out_addr+2] = res.z;
+			if(gidx+3<cols)
+				dst[out_addr+3] = res.w;
+		}
+	}
+}
+#else
+// Generic morphology kernel (erode/dilate, chosen through MORPH_OP): every
+// work-item produces one output pixel of type GENTYPE.
+//
+// The work-group first copies a tile of the source, padded by the
+// structuring-element radius, into local memory (two elements per
+// work-item), replacing anything outside the whole source image by VAL.
+// After the barrier each work-item reduces the taps enabled in mat_kernel.
+//
+// LSIZE0, LSIZE1, RADIUSX, RADIUSY, VAL and GENTYPE come from the build options.
+__kernel void morph(__global const GENTYPE * restrict src,
+						  __global GENTYPE *dst, 
+						  int src_offset_x, int src_offset_y, 
+						  int cols, int rows, 
+					      int src_step_in_pixel, int dst_step_in_pixel, 
+						  __constant uchar * mat_kernel, 
+						  int src_whole_cols, int src_whole_rows,
+						  int dst_offset_in_pixel)
+{
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+
+	// Top-left corner of the tile in whole-image coordinates.
+	const int tile_x0 = (int)(get_group_id(0)*LSIZE0) + src_offset_x - RADIUSX;
+	const int tile_y0 = (int)(get_group_id(1)*LSIZE1) + src_offset_y - RADIUSY;
+	// Row pitch of the tile stored in local memory.
+	const int tile_w = LSIZE0 + 2*RADIUSX + 1;
+
+	// The two tile slots this work-item is responsible for filling.
+	const int slot0 = mad24(ly,LSIZE0,lx);
+	const int slot1 = slot0 + LSIZE0*LSIZE1;
+	const int px0 = tile_x0 + slot0 % tile_w;
+	const int py0 = tile_y0 + slot0 / tile_w;
+	const int px1 = tile_x0 + slot1 % tile_w;
+	const int py1 = tile_y0 + slot1 / tile_w;
+
+	__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
+
+	// Source addresses; anything outside (0, last_addr) is redirected to
+	// element 0 so the global load itself stays inside the buffer.
+	const int last_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
+	int addr0 = mad24(py0,src_step_in_pixel,px0);
+	int addr1 = mad24(py1,src_step_in_pixel,px1);
+	if(!((addr0 < last_addr) && (addr0 > 0)))
+		addr0 = 0;
+	if(!((addr1 < last_addr) && (addr1 > 0)))
+		addr1 = 0;
+	GENTYPE pix0 = src[addr0];
+	GENTYPE pix1 = src[addr1];
+
+	// Constant border: pixels whose coordinates fall outside the image become VAL.
+	pix0 = ELEM(px0,0,src_whole_cols,(GENTYPE)VAL,pix0);
+	pix0 = ELEM(py0,0,src_whole_rows,(GENTYPE)VAL,pix0);
+	pix1 = ELEM(px1,0,src_whole_cols,(GENTYPE)VAL,pix1);
+	pix1 = ELEM(py1,0,src_whole_rows,(GENTYPE)VAL,pix1);
+
+	LDS_DAT[slot0] = pix0;
+	LDS_DAT[slot1] = pix1;
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Reduce over the enabled taps; k walks mat_kernel in row-major order.
+	GENTYPE res = (GENTYPE)VAL;
+	int k = 0;
+	for(int dy=0;dy<2*RADIUSY+1;dy++)
+	{
+		const int row_base = mad24(ly+dy,tile_w,lx);
+		for(int dx=0;dx<2*RADIUSX+1;dx++,k++)
+		{
+			if(mat_kernel[k])
+				res = MORPH_OP(res,LDS_DAT[row_base+dx]);
+		}
+	}
+
+	// Store the result when the work-item maps to a pixel inside the output.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+	if(gidx<cols && gidy<rows)
+		dst[mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel)] = res;
+}
+#endif
--- a/modules/ocl/src/kernels/imgproc_remap.cl
+++ b/modules/ocl/src/kernels/imgproc_remap.cl
@@ -94,7 +94,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
        uchar4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
+        dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;

        *d = dst_data;

@@ -139,8 +139,8 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
        uchar4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
  
+        dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
        *d = dst_data;

    }
@@ -167,7 +167,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
        short8 map1_data;

        map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
-        int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
+        int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
        uchar4 src_a, src_b, src_c, src_d;
        src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
        src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
@@ -187,7 +187,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig
        uchar16 dVal = *d;      

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
+        dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;

        *d = dst_data;
    }
@@ -216,7 +216,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
        map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
        int8 map1_dataZ = convert_int8_sat_rte(map1_data);

-        int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<2) + src_offset;
+        int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset;
        uchar4 src_a, src_b, src_c, src_d;
        src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
        src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
@@ -236,7 +236,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig
        uchar16 dVal = *d;      

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
+        dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;

        *d = dst_data;

@@ -269,7 +269,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *

        map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
    
-        int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
+        int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
    
        float4 src_data;
        src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
@@ -289,7 +289,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
        float4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
+        dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;

        *d = dst_data;

@@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
        map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
        int8 map1_dataZ = convert_int8_sat_rte(map1_data);

-        int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<2) + src_offset;
+        int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
    
        float4 src_data;
        src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
@@ -341,7 +341,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
        float4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
+        dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;

        *d = dst_data;

@@ -418,21 +418,21 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

      float4 u = temp.even;
      float4 v = temp.odd;
-      float4 ud = 1.f - u;
-      float4 vd = 1.f - v;
+      float4 ud = (float4)(1.0) - u;
+      float4 vd = (float4)(1.0) - v;
      //float8 map1_dataU = map1_dataD + 1;

      int4 map1_dataDx = map1_dataD.even;
      int4 map1_dataDy = map1_dataD.odd;
-      int4 map1_dataDx1 = map1_dataDx + 1;
-      int4 map1_dataDy1 = map1_dataDy + 1;
+      int4 map1_dataDx1 = map1_dataDx + (int4)(1);
+      int4 map1_dataDy1 = map1_dataDy + (int4)(1);

      int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
      int4 src_StartD = src_StartU + src_step;
     /* 
      //not using the vload
-      int4 src_StartU1 = src_StartU + 1;
-      int4 src_StartD1 = src_StartD + 1;
+      int4 src_StartU1 = src_StartU + (int4)(1);
+      int4 src_StartD1 = src_StartD + (int4)(1);

      uchar4 a, b, c, d;
      a.x = *(src_StartU.x + src);
@@ -476,10 +476,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
      int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
      int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
      int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
-      a = (convert_uchar4(ac) == (uchar4)0)? a : val;
-      b = (convert_uchar4(bc) == (uchar4)0)? b : val;
-      c = (convert_uchar4(cc) == (uchar4)0)? c : val;
-      d = (convert_uchar4(dc) == (uchar4)0)? d : val;
+      a = (convert_uchar4(ac) == (uchar4)(0))? a : val;
+      b = (convert_uchar4(bc) == (uchar4)(0))? b : val;
+      c = (convert_uchar4(cc) == (uchar4)(0))? c : val;
+      d = (convert_uchar4(dc) == (uchar4)(0))? d : val;

      uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
    
@@ -487,7 +487,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig

      uchar4 dVal = *D;      
      int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-      dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
+      dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;

      *D = dst_data;
    }
@@ -531,7 +531,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig
        uchar4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
+        dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;

        *d = dst_data;

@@ -567,17 +567,17 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

      float4 u = temp.even;
      float4 v = temp.odd;
-      float4 ud = 1.f - u;
-      float4 vd = 1.f - v;
+      float4 ud = (float4)(1.0) - u;
+      float4 vd = (float4)(1.0) - v;
      
      //float8 map1_dataU = map1_dataD + 1;

      int4 map1_dataDx = map1_dataD.even;
      int4 map1_dataDy = map1_dataD.odd;
-      int4 map1_dataDx1 = map1_dataDx + 1;
-      int4 map1_dataDy1 = map1_dataDy + 1;
+      int4 map1_dataDx1 = map1_dataDx + (int4)(1);
+      int4 map1_dataDy1 = map1_dataDy + (int4)(1);

-      int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
+      int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset;
      int4 src_StartD = src_StartU + src_step;

      uchar8 aU, bU, cU, dU, aD, bD, cD, dD;
@@ -605,10 +605,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig
      int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
 
      uchar16 val = (uchar16)(nval, nval, nval, nval);
-      a = (convert_uchar16(acc) == (uchar16)0)? a : val;
-      b = (convert_uchar16(bcc) == (uchar16)0)? b : val;
-      c = (convert_uchar16(ccc) == (uchar16)0)? c : val;
-      d = (convert_uchar16(dcc) == (uchar16)0)? d : val;
+      a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
+      b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
+      c = (convert_uchar16(ccc) == (uchar16)(0))? c : val;
+      d = (convert_uchar16(dcc) == (uchar16)(0))? d : val;

      float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w));
      float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w));
@@ -621,7 +621,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig

      uchar16 dVal = *D;      
      int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-      dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
+      dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;

      *D = dst_data;
    }
@@ -646,7 +646,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
        short8 map1_data;

        map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
-        int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
+        int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<(int4)(2)) + src_offset;
        uchar4 src_a, src_b, src_c, src_d;
        src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
        src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
@@ -666,7 +666,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig
        uchar16 dVal = *d;      

        int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
+        dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;

        *d = dst_data;

@@ -700,21 +700,21 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

      float4 u = temp.even;
      float4 v = temp.odd;
-      float4 ud = 1.f - u;
-      float4 vd = 1.f - v;
+      float4 ud = (float4)(1.0) - u;
+      float4 vd = (float4)(1.0) - v;
      //float8 map1_dataU = map1_dataD + 1;

      int4 map1_dataDx = map1_dataD.even;
      int4 map1_dataDy = map1_dataD.odd;
-      int4 map1_dataDx1 = map1_dataDx + 1;
-      int4 map1_dataDy1 = map1_dataDy + 1;
+      int4 map1_dataDx1 = map1_dataDx + (int4)(1);
+      int4 map1_dataDy1 = map1_dataDy + (int4)(1);

-      int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
+      int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
      int4 src_StartD = src_StartU + src_step;
     /* 
      //not using the vload
-      int4 src_StartU1 = src_StartU + 1;
-      int4 src_StartD1 = src_StartD + 1;
+      int4 src_StartU1 = src_StartU + (int4)(1);
+      int4 src_StartD1 = src_StartD + (int4)(1);

      float4 a, b, c, d;
      a.x = *(src_StartU.x + src);
@@ -754,14 +754,14 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
      c = (float4)(aD.x, bD.x, cD.x, dD.x);
      d = (float4)(aD.y, bD.y, cD.y, dD.y);
      
-      int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
-      int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
-      int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
-      int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
-      a = (convert_float4(ac) == 0)? a : val;
-      b = (convert_float4(bc) == 0)? b : val;
-      c = (convert_float4(cc) == 0)? c : val;
-      d = (convert_float4(dc) == 0)? d : val;
+      int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx < (int4)(0) || map1_dataDy < (int4)(0));     // a: (Dx , Dy ) outside src
+      int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));   // b: (Dx1, Dy ) outside src
+      int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));   // c: (Dx , Dy1) outside src
+      int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy1 < (int4)(0)); // d: (Dx1, Dy1) outside src
+      a = (convert_float4(ac) == (float4)(0))? a : val;
+      b = (convert_float4(bc) == (float4)(0))? b : val;
+      c = (convert_float4(cc) == (float4)(0))? c : val;
+      d = (convert_float4(dc) == (float4)(0))? d : val;

      float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
    
@@ -769,7 +769,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *

      float4 dVal = *D;      
      int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
-      dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
+      dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;

      *D = dst_data;
    }
@@ -798,7 +798,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *

        map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
    
-        int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
+        int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;
    
        float4 src_data;
        src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
@@ -818,7 +818,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const *
        float4 dVal = *d;      

        int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
-        dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
+        dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;

        *d = dst_data;

@@ -918,7 +918,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im
      __global uchar4* d = (__global uchar4 *)(dst + dstStart);
      uchar4 dVal = *d;
      int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-      dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
+      dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;

      *d = dst_data;   
    }

--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -588,6 +588,13 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern
 			sprintf(compile_option, "-D GENTYPE=int");
 			args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 			break;
+		case 2:
+			sprintf(compile_option, "-D GENTYPE=int2");
+			cl_int2 i2val;
+			i2val.s[0] = val.ival.s[0];
+			i2val.s[1] = val.ival.s[1];
+			args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
+			break;
 		case 4:
 			sprintf(compile_option, "-D GENTYPE=int4");
 			args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));

--- a/modules/ocl/src/threadsafe.cpp
+++ b/modules/ocl/src/threadsafe.cpp
@@ -44,7 +44,7 @@
 //M*/

 #include "precomp.hpp"
-#include "threadsafe.h"
+#include "Threadsafe.h"

 CriticalSection::CriticalSection()
 {

--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
--- a/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetector.java
+++ b/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetector.java
@@ -115,5 +115,5 @@ public class ColorBlobDetector
 	// Color radius for range checking in HSV color space
 	private Scalar mColorRadius = new Scalar(25,50,50,0);
 	private Mat mSpectrum = new Mat();
-	private List<MatOfPoint> mContours = new ArrayList<MatOfPoint>();;
+	private List<MatOfPoint> mContours = new ArrayList<MatOfPoint>();
 }
--- a/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/SampleCvViewBase.java
+++ b/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/SampleCvViewBase.java
@@ -115,6 +115,7 @@ public abstract class SampleCvViewBase extends SurfaceView implements SurfaceHol
            if (bmp != null) {
                Canvas canvas = mHolder.lockCanvas();
                if (canvas != null) {
+                	canvas.drawColor(0, android.graphics.PorterDuff.Mode.CLEAR);
                	canvas.drawBitmap(bmp, (canvas.getWidth() - bmp.getWidth()) / 2, (canvas.getHeight() - bmp.getHeight()) / 2, null);
                    mHolder.unlockCanvasAndPost(canvas);
                }

--- a/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsView.java
+++ b/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsView.java
@@ -127,7 +127,7 @@ class ImageManipulationsView extends SampleCvViewBase {

        case ImageManipulationsActivity.VIEW_MODE_HIST:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-            if (mSizeRgba == null)
+            if ((mSizeRgba == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();
            int thikness = (int) (mSizeRgba.width / (mHistSizeNum + 10) / 5);
            if(thikness > 5) thikness = 5;
@@ -171,7 +171,7 @@ class ImageManipulationsView extends SampleCvViewBase {
        case ImageManipulationsActivity.VIEW_MODE_CANNY:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);

-            if (mRgbaInnerWindow == null || mGrayInnerWindow == null)
+            if ((mRgbaInnerWindow == null) || (mGrayInnerWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();
            Imgproc.Canny(mRgbaInnerWindow, mIntermediateMat, 80, 90);
            Imgproc.cvtColor(mIntermediateMat, mRgbaInnerWindow, Imgproc.COLOR_GRAY2BGRA, 4);
@@ -181,7 +181,7 @@ class ImageManipulationsView extends SampleCvViewBase {
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
            capture.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);

-            if (mRgbaInnerWindow == null || mGrayInnerWindow == null)
+            if ((mRgbaInnerWindow == null) || (mGrayInnerWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();

            Imgproc.Sobel(mGrayInnerWindow, mIntermediateMat, CvType.CV_8U, 1, 1);
@@ -196,7 +196,7 @@ class ImageManipulationsView extends SampleCvViewBase {

        case ImageManipulationsActivity.VIEW_MODE_ZOOM:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-            if (mZoomCorner == null || mZoomWindow == null)
+            if ((mZoomCorner == null) || (mZoomWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();
            Imgproc.resize(mZoomWindow, mZoomCorner, mZoomCorner.size());

@@ -206,7 +206,7 @@ class ImageManipulationsView extends SampleCvViewBase {

        case ImageManipulationsActivity.VIEW_MODE_PIXELIZE:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-            if (mRgbaInnerWindow == null)
+            if ((mRgbaInnerWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();
            Imgproc.resize(mRgbaInnerWindow, mIntermediateMat, mSize0, 0.1, 0.1, Imgproc.INTER_NEAREST);
            Imgproc.resize(mIntermediateMat, mRgbaInnerWindow, mSizeRgbaInner, 0., 0., Imgproc.INTER_NEAREST);
@@ -214,7 +214,7 @@ class ImageManipulationsView extends SampleCvViewBase {

        case ImageManipulationsActivity.VIEW_MODE_POSTERIZE:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-            if (mRgbaInnerWindow == null)
+            if ((mRgbaInnerWindow == null) || (mRgba.cols() != mSizeRgba.width) || (mRgba.height() != mSizeRgba.height))
                CreateAuxiliaryMats();
            /*
            Imgproc.cvtColor(mRgbaInnerWindow, mIntermediateMat, Imgproc.COLOR_RGBA2RGB);

--- a/samples/android/tutorial-0-androidcamera/src/org/opencv/samples/tutorial0/Sample0Base.java
+++ b/samples/android/tutorial-0-androidcamera/src/org/opencv/samples/tutorial0/Sample0Base.java
@@ -10,13 +10,13 @@ import android.view.MenuItem;
 import android.view.Window;

 public class Sample0Base extends Activity {
+    
 	private static final String TAG            = "Sample::Activity";

    private MenuItem            mItemPreviewRGBA;
    private MenuItem            mItemPreviewGray;
    private Sample0View         mView;

-
    public Sample0Base() {
        Log.i(TAG, "Instantiated new " + this.getClass());
    }

--- a/samples/android/tutorial-2-opencvcamera/src/org/opencv/samples/tutorial2/Sample2View.java
+++ b/samples/android/tutorial-2-opencvcamera/src/org/opencv/samples/tutorial2/Sample2View.java
@@ -44,7 +44,7 @@ class Sample2View extends SampleCvViewBase {
            break;
        case Sample2NativeCamera.VIEW_MODE_RGBA:
            capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-            Core.putText(mRgba, "OpenCV + Android", new Point(10, 100), 3, 2, new Scalar(255, 0, 0, 255), 3);
+            Core.putText(mRgba, "OpenCV+Android", new Point(10, 50), 3, 1, new Scalar(255, 0, 0, 255), 2);
            break;
        case Sample2NativeCamera.VIEW_MODE_CANNY:
            capture.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);

--- a/samples/android/tutorial-2-opencvcamera/src/org/opencv/samples/tutorial2/SampleCvViewBase.java
+++ b/samples/android/tutorial-2-opencvcamera/src/org/opencv/samples/tutorial2/SampleCvViewBase.java
@@ -115,7 +115,8 @@ public abstract class SampleCvViewBase extends SurfaceView implements SurfaceHol
            if (bmp != null) {
                Canvas canvas = mHolder.lockCanvas();
                if (canvas != null) {
-                    canvas.drawBitmap(bmp, (canvas.getWidth() - bmp.getWidth()) / 2, (canvas.getHeight() - bmp.getHeight()) / 2, null);
+                	canvas.drawColor(0, android.graphics.PorterDuff.Mode.CLEAR);
+                	canvas.drawBitmap(bmp, (canvas.getWidth()-bmp.getWidth()) / 2, (canvas.getHeight()-bmp.getHeight()) / 2, null);
                    mHolder.unlockCanvasAndPost(canvas);
                }
                bmp.recycle();

--- a/samples/ocl/squares.cpp
+++ b/samples/ocl/squares.cpp
+// The "Square Detector" program.
+// It loads several images sequentially and tries to find squares in
+// each image
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/ocl/ocl.hpp"
+
+#include <iostream>
+#include <math.h>
+#include <string.h>
+
+using namespace cv;
+using namespace std;
+
+// Prints a short description of the demo, the command used to launch it and
+// the OpenCV version (CV_VERSION) to standard output.
+void help()
+{
+    cout <<
+        "\nA program using OCL module pyramid scaling, Canny, dilate functions, threshold, split; cpu contours, contour simpification and\n"
+        "memory storage (it's got it all folks) to find\n"
+        "squares in a list of images pic1-6.png\n"
+        "Returns sequence of squares detected on the image.\n"
+        "the sequence is stored in the specified memory storage\n"
+        "Call:\n"
+        "./squares\n"
+        // The version is streamed directly: iostreams do not interpret
+        // printf-style "%s" placeholders, so the old literal was printed verbatim.
+        "Using OpenCV version " << CV_VERSION << "\n" << endl;
+}
+
+
+// thresh: upper threshold passed to Canny in findSquares().
+// N:      number of threshold levels tried for each color plane.
+int thresh = 50, N = 11;
+// Name of the window used by namedWindow()/imshow() to display results.
+const char* wndname = "OpenCL Square Detection Demo";
+
+// Helper: cosine of the angle at vertex pt0 between the vectors
+// pt0->pt1 and pt0->pt2. A tiny constant (1e-10) is added under the
+// square root so that zero-length vectors do not divide by zero.
+double angle( Point pt1, Point pt2, Point pt0 )
+{
+    const double ax = pt1.x - pt0.x, ay = pt1.y - pt0.y;
+    const double bx = pt2.x - pt0.x, by = pt2.y - pt0.y;
+
+    const double dot    = ax*bx + ay*by;
+    const double lenSqA = ax*ax + ay*ay;
+    const double lenSqB = bx*bx + by*by;
+
+    return dot/sqrt(lenSqA*lenSqB + 1e-10);
+}
+
+// Detects square-like contours in `image` and returns them through `squares`
+// (cleared on entry); every stored entry is a 4-vertex polygon.
+// Pyramid scaling, channel split, Canny, dilate and threshold are invoked
+// through the ocl module; contour extraction and the polygon tests use the
+// regular (CPU) functions.
+void findSquares( const Mat& image, vector<vector<Point> >& squares )
+{
+    squares.clear();
+
+    Mat gray;
+    cv::ocl::oclMat pyr_ocl, timg_ocl, gray0_ocl, gray_ocl;
+
+    // down-scale and upscale the image to filter out the noise
+    ocl::pyrDown(ocl::oclMat(image), pyr_ocl);
+    ocl::pyrUp(pyr_ocl, timg_ocl);
+
+    vector<vector<Point> > contours;
+    vector<cv::ocl::oclMat> gray0s;
+    ocl::split(timg_ocl, gray0s); // split 3 channels into a vector of oclMat
+    // find squares in every color plane of the image
+    // NOTE(review): gray0s[0..2] are indexed unconditionally, so the input is
+    // assumed to have at least 3 channels (main() loads images with imread(..., 1)).
+    for( int c = 0; c < 3; c++ )
+    {
+        gray0_ocl = gray0s[c];
+        // try several threshold levels
+        for( int l = 0; l < N; l++ )
+        {
+            // hack: use Canny instead of zero threshold level.
+            // Canny helps to catch squares with gradient shading
+            if( l == 0 )
+            {
+                // do canny on OpenCL device
+                // apply Canny. Take the upper threshold from the global
+                // `thresh` and set the lower to 0 (which forces edges merging)
+                cv::ocl::Canny(gray0_ocl, gray_ocl, 0, thresh, 5);
+                // dilate canny output to remove potential
+                // holes between edge segments
+                ocl::dilate(gray_ocl, gray_ocl, Mat(), Point(-1,-1));
+                // convert the result to a cv::Mat for the CPU contour search
+                gray = Mat(gray_ocl);
+            }
+            else
+            {
+                // apply threshold if l!=0 (THRESH_BINARY):
+                //     gray(x,y) = gray0(x,y) > (l+1)*255/N ? 255 : 0
+                cv::ocl::threshold(gray0_ocl, gray_ocl, (l+1)*255/N, 255, THRESH_BINARY);
+                gray = gray_ocl;
+            }
+
+            // find contours and store them all as a list
+            findContours(gray, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);
+
+            vector<Point> approx;
+
+            // test each contour
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                // approximate contour with accuracy proportional
+                // to the contour perimeter
+                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
+
+                // square contours should have 4 vertices after approximation
+                // relatively large area (to filter out noisy contours)
+                // and be convex.
+                // Note: absolute value of an area is used because
+                // area may be positive or negative - in accordance with the
+                // contour orientation
+                if( approx.size() == 4 &&
+                    fabs(contourArea(Mat(approx))) > 1000 &&
+                    isContourConvex(Mat(approx)) )
+                {
+                    double maxCosine = 0;
+
+                    // j = 2, 3, 4 evaluates the corners at vertices 1, 2 and 3
+                    // (j%4 wraps the last edge back to vertex 0)
+                    for( int j = 2; j < 5; j++ )
+                    {
+                        // find the maximum cosine of the angle between joint edges
+                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
+                        maxCosine = MAX(maxCosine, cosine);
+                    }
+
+                    // if cosines of all angles are small
+                    // (all angles are ~90 degree) then write quadrangle
+                    // vertices to resultant sequence
+                    if( maxCosine < 0.3 )
+                        squares.push_back(approx);
+                }
+            }
+        }
+    }
+}
+
+
+// Outlines every polygon of `squares` on `image` as a closed green
+// (Scalar(0,255,0)) poly-line of thickness 3, then shows the image in the
+// `wndname` window.
+void drawSquares( Mat& image, const vector<vector<Point> >& squares )
+{
+    typedef vector<vector<Point> >::const_iterator SquareIt;
+
+    for( SquareIt sq = squares.begin(); sq != squares.end(); ++sq )
+    {
+        // polylines() takes arrays of point pointers and point counts;
+        // a single polygon is passed per call.
+        const Point* vertices = &(*sq)[0];
+        int vertexCount = (int)sq->size();
+        polylines(image, &vertices, &vertexCount, 1, true, Scalar(0,255,0), 3, CV_AA);
+    }
+
+    imshow(wndname, image);
+}
+
+
+// Entry point: queries the OpenCL devices via ocl::getDevice() (asserting
+// success), then runs the detector on pic1.png .. pic6.png one after another.
+// Each result stays on screen until a key is pressed; ESC (code 27) stops early.
+int main(int /*argc*/, char** /*argv*/)
+{
+
+    //ocl::setBinpath("F:/kernel_bin");
+    vector<ocl::Info> info;
+    CV_Assert(ocl::getDevice(info));
+
+    // null-terminated list of input images
+    static const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
+        "pic4.png", "pic5.png", "pic6.png", 0 };
+    help();
+    namedWindow( wndname, 1 );
+    vector<vector<Point> > squares;
+
+    for( const char* const* name = names; *name != 0; ++name )
+    {
+        Mat image = imread(*name, 1);
+        if( !image.empty() )
+        {
+            findSquares(image, squares);
+            drawSquares(image, squares);
+
+            // block until a key is pressed; ESC ends the demo
+            const int key = waitKey();
+            if( (char)key == 27 )
+                break;
+        }
+        else
+        {
+            cout << "Couldn't load " << *name << endl;
+        }
+    }
+
+    return 0;
+}