Commit 297ba853 authored by Alexander Alekhin

Merge pull request #8441 from alalek:dispatch_mathfuncs_core

parents 36e80175 1e6ce1d2
...@@ -275,6 +275,11 @@ set(CPU_BASELINE_FLAGS "") ...@@ -275,6 +275,11 @@ set(CPU_BASELINE_FLAGS "")
set(CPU_BASELINE_FINAL "") set(CPU_BASELINE_FINAL "")
set(CPU_DISPATCH_FINAL "") set(CPU_DISPATCH_FINAL "")
# When CV_DISABLE_OPTIMIZATION is set, blank out both dispatch request lists
# before the per-optimization checks below run.
# NOTE(review): presumably this guarantees that no dispatched (runtime-selected)
# optimization is configured when optimizations are globally disabled - confirm
# against the code that consumes CPU_DISPATCH / CPU_DISPATCH_REQUIRE.
if(CV_DISABLE_OPTIMIZATION)
set(CPU_DISPATCH "")
set(CPU_DISPATCH_REQUIRE "")
endif()
macro(ocv_check_compiler_optimization OPT) macro(ocv_check_compiler_optimization OPT)
if(NOT DEFINED CPU_${OPT}_SUPPORTED) if(NOT DEFINED CPU_${OPT}_SUPPORTED)
if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE) if((DEFINED CPU_${OPT}_FLAGS_ON AND NOT "x${CPU_${OPT}_FLAGS_ON}" STREQUAL "x") OR CPU_${OPT}_TEST_FILE)
...@@ -319,7 +324,7 @@ macro(ocv_check_compiler_optimization OPT) ...@@ -319,7 +324,7 @@ macro(ocv_check_compiler_optimization OPT)
endmacro() endmacro()
foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "" FORCE) set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "")
if(NOT DEFINED CPU_${OPT}_FORCE) if(NOT DEFINED CPU_${OPT}_FORCE)
set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}") set(CPU_${OPT}_FORCE "${CPU_${OPT}_IMPLIES}")
endif() endif()
...@@ -515,15 +520,27 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T ...@@ -515,15 +520,27 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
endforeach() endforeach()
foreach(fname ${${SOURCES_VAR_NAME}}) foreach(fname ${${SOURCES_VAR_NAME}})
string(TOLOWER "${fname}" fname_LOWER) string(TOLOWER "${fname}" fname_LOWER)
if(fname_LOWER MATCHES "[.]opt_.*[.]cpp$") if(fname_LOWER MATCHES "\\.(.*)\\.cpp$")
if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS) string(TOUPPER "${CMAKE_MATCH_1}" OPT_)
message(STATUS "Excluding from source files list: ${fname}") if(OPT_ MATCHES "(CUDA.*|DISPATCH.*|OCL)") # don't touch files like filename.cuda.cpp
list(APPEND __result "${fname}")
#continue()
elseif(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
message(STATUS "Excluding from source files list (optimization is disabled): ${fname}")
#continue() #continue()
else() else()
get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS)
if(__definitions)
list(APPEND __definitions "CV_CPU_DISPATCH_MODE=${OPT_}")
else()
set(__definitions "CV_CPU_DISPATCH_MODE=${OPT_}")
endif()
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}")
set(__opt_found 0) set(__opt_found 0)
foreach(OPT ${CPU_BASELINE_FINAL}) foreach(OPT ${CPU_BASELINE_FINAL})
string(TOLOWER "${OPT}" OPT_LOWER) string(TOLOWER "${OPT}" OPT_LOWER)
if(fname_LOWER MATCHES "_${OPT_LOWER}[.]cpp$") if(fname_LOWER MATCHES "\\.${OPT_LOWER}\\.cpp$")
#message("${fname} BASELINE-${OPT}") #message("${fname} BASELINE-${OPT}")
set(__opt_found 1) set(__opt_found 1)
list(APPEND __result "${fname}") list(APPEND __result "${fname}")
...@@ -533,11 +550,11 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T ...@@ -533,11 +550,11 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
foreach(OPT ${CPU_DISPATCH_FINAL}) foreach(OPT ${CPU_DISPATCH_FINAL})
foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED}) foreach(OPT2 ${CPU_DISPATCH_${OPT}_FORCED})
string(TOLOWER "${OPT2}" OPT2_LOWER) string(TOLOWER "${OPT2}" OPT2_LOWER)
if(fname_LOWER MATCHES "_${OPT2_LOWER}[.]cpp$") if(fname_LOWER MATCHES "\\.${OPT2_LOWER}\\.cpp$")
list(APPEND __result_${OPT} "${fname}") list(APPEND __result_${OPT} "${fname}")
math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1") math(EXPR CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}+1")
set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE) set(CPU_${OPT}_USAGE_COUNT "${CPU_${OPT}_USAGE_COUNT}" CACHE INTERNAL "" FORCE)
#message("${fname} ${OPT}") #message("(${CPU_${OPT}_USAGE_COUNT})${fname} ${OPT}")
#message(" ${CPU_DISPATCH_${OPT}_INCLUDED}") #message(" ${CPU_DISPATCH_${OPT}_INCLUDED}")
#message(" ${CPU_DISPATCH_DEFINITIONS_${OPT}}") #message(" ${CPU_DISPATCH_DEFINITIONS_${OPT}}")
#message(" ${CPU_DISPATCH_FLAGS_${OPT}}") #message(" ${CPU_DISPATCH_FLAGS_${OPT}}")
...@@ -573,7 +590,13 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T ...@@ -573,7 +590,13 @@ macro(ocv_compiler_optimization_process_sources SOURCES_VAR_NAME LIBS_VAR_NAME T
list(APPEND __result "$<TARGET_OBJECTS:${TARGET_BASE_NAME}_${OPT}>") list(APPEND __result "$<TARGET_OBJECTS:${TARGET_BASE_NAME}_${OPT}>")
else() else()
foreach(fname ${__result_${OPT}}) foreach(fname ${__result_${OPT}})
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${CPU_DISPATCH_DEFINITIONS_${OPT}}") get_source_file_property(__definitions "${fname}" COMPILE_DEFINITIONS)
if(__definitions)
list(APPEND __definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
else()
set(__definitions "${CPU_DISPATCH_DEFINITIONS_${OPT}}")
endif()
set_source_files_properties("${fname}" PROPERTIES COMPILE_DEFINITIONS "${__definitions}")
set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}") set_source_files_properties("${fname}" PROPERTIES COMPILE_FLAGS "${CPU_DISPATCH_FLAGS_${OPT}}")
endforeach() endforeach()
list(APPEND __result ${__result_${OPT}}) list(APPEND __result ${__result_${OPT}})
...@@ -620,18 +643,25 @@ macro(ocv_compiler_optimization_fill_cpu_config) ...@@ -620,18 +643,25 @@ macro(ocv_compiler_optimization_fill_cpu_config)
set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE} set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT} #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
# define CV_CPU_HAS_SUPPORT_${OPT} 1 # define CV_CPU_HAS_SUPPORT_${OPT} 1
# define CV_CPU_CALL_${OPT}(...) return __VA_ARGS__ # define CV_CPU_CALL_${OPT}(fn, args) return (opt_${OPT}::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT} #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
# define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT})) # define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
# define CV_CPU_CALL_${OPT}(...) if (CV_CPU_HAS_SUPPORT_${OPT}) return __VA_ARGS__ # define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_${OPT} 0 # define CV_CPU_HAS_SUPPORT_${OPT} 0
# define CV_CPU_CALL_${OPT}(...) # define CV_CPU_CALL_${OPT}(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_${OPT}(fn, args, mode, ...) CV_CPU_CALL_${OPT}(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
") ")
endif() endif()
endforeach() endforeach()
set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
")
set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h") set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h")
if(EXISTS "${__file}") if(EXISTS "${__file}")
file(READ "${__file}" __content) file(READ "${__file}" __content)
...@@ -644,6 +674,57 @@ macro(ocv_compiler_optimization_fill_cpu_config) ...@@ -644,6 +674,57 @@ macro(ocv_compiler_optimization_fill_cpu_config)
endif() endif()
endmacro() endmacro()
# ocv_add_dispatched_file(<filename> [<OPT>...])
#
# Generates the glue files needed to compile "<filename>.simd.hpp" once per
# requested CPU optimization. Everything is written to CMAKE_CURRENT_BINARY_DIR:
#
#   <filename>.<opt>.cpp              one per <OPT>; includes precomp.hpp and
#                                     <filename>.simd.hpp. Each path is appended
#                                     to OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED.
#   <filename>.simd_declarations.hpp  defines CV_CPU_SIMD_FILENAME, includes
#                                     cv_cpu_include_simd_declarations.hpp once
#                                     per <OPT> with CV_CPU_DISPATCH_MODE set,
#                                     and defines CV_CPU_DISPATCH_MODES_ALL.
#
# CV_CPU_DISPATCH_MODES_ALL lists the modes in reverse argument order, followed
# by BASELINE (e.g. "SSE2 AVX AVX2" -> "AVX2, AVX, SSE2, BASELINE").
#
# Nothing is generated while OPENCV_INITIAL_PASS is true. When
# CV_DISABLE_OPTIMIZATION is set or CV_ENABLE_INTRINSICS is off, the <OPT> list
# is ignored and only the BASELINE declarations header is produced.
#
# Files are rewritten only when their content changes, so an unchanged
# configuration does not touch timestamps and trigger rebuilds.
#
# This is a macro on purpose: it relies on `the_module` from the calling scope
# and updates OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED there.
macro(ocv_add_dispatched_file filename)
  if(NOT OPENCV_INITIAL_PASS)
    set(__codestr "
#include \"precomp.hpp\"
#include \"${filename}.simd.hpp\"
")
    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${filename}.simd.hpp\"")
    set(__dispatch_modes "BASELINE")

    set(__optimizations "${ARGN}")
    if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
      set(__optimizations "")
    endif()

    foreach(OPT ${__optimizations})
      string(TOLOWER "${OPT}" OPT_LOWER)
      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${OPT_LOWER}.cpp")
      # Reset before reading: every generated .cpp has the same expected
      # content, so a stale __content from a previous iteration (or leaked in
      # from the caller, since this is a macro) would make a missing file look
      # up-to-date and it would never be created.
      set(__content "")
      if(EXISTS "${__file}")
        file(READ "${__file}" __content)
      endif()
      if(NOT __content STREQUAL __codestr)
        file(WRITE "${__file}" "${__codestr}")
      endif()
      list(APPEND OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED "${__file}")

      set(__declarations_str "${__declarations_str}
#define CV_CPU_DISPATCH_MODE ${OPT}
#include \"opencv2/core/private/cv_cpu_include_simd_declarations.hpp\"
")
      # Prepend, so the optimization listed last by the caller is tried first.
      set(__dispatch_modes "${OPT}, ${__dispatch_modes}")
    endforeach()

    set(__declarations_str "${__declarations_str}
#define CV_CPU_DISPATCH_MODES_ALL ${__dispatch_modes}
")

    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.simd_declarations.hpp")
    # Same reset as above so the comparison only ever sees this file's content.
    set(__content "")
    if(EXISTS "${__file}")
      file(READ "${__file}" __content)
    endif()
    if(NOT __content STREQUAL __declarations_str)
      file(WRITE "${__file}" "${__declarations_str}")
    endif()
  endif()
endmacro()
if(CV_DISABLE_OPTIMIZATION OR CV_ICC) if(CV_DISABLE_OPTIMIZATION OR CV_ICC)
ocv_update(CV_ENABLE_UNROLLED 0) ocv_update(CV_ENABLE_UNROLLED 0)
else() else()
......
...@@ -314,6 +314,7 @@ macro(ocv_glob_modules) ...@@ -314,6 +314,7 @@ macro(ocv_glob_modules)
set(OPENCV_INITIAL_PASS OFF) set(OPENCV_INITIAL_PASS OFF)
if(${BUILD_opencv_world}) if(${BUILD_opencv_world})
foreach(m ${OPENCV_MODULES_BUILD}) foreach(m ${OPENCV_MODULES_BUILD})
set(the_module "${m}")
if("${m}" STREQUAL opencv_world) if("${m}" STREQUAL opencv_world)
add_subdirectory("${OPENCV_MODULE_opencv_world_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/world") add_subdirectory("${OPENCV_MODULE_opencv_world_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/world")
elseif(NOT OPENCV_MODULE_${m}_IS_PART_OF_WORLD AND NOT ${m} STREQUAL opencv_world) elseif(NOT OPENCV_MODULE_${m}_IS_PART_OF_WORLD AND NOT ${m} STREQUAL opencv_world)
...@@ -329,6 +330,7 @@ macro(ocv_glob_modules) ...@@ -329,6 +330,7 @@ macro(ocv_glob_modules)
endforeach() endforeach()
else() else()
foreach(m ${OPENCV_MODULES_BUILD}) foreach(m ${OPENCV_MODULES_BUILD})
set(the_module "${m}")
if(m MATCHES "^opencv_") if(m MATCHES "^opencv_")
string(REGEX REPLACE "^opencv_" "" __shortname "${m}") string(REGEX REPLACE "^opencv_" "" __shortname "${m}")
add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${__shortname}") add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${__shortname}")
...@@ -646,11 +648,13 @@ macro(ocv_set_module_sources) ...@@ -646,11 +648,13 @@ macro(ocv_set_module_sources)
ocv_get_module_external_sources() ocv_get_module_external_sources()
endif() endif()
if(OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED)
list(APPEND OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED})
endif()
# use full paths for module to be independent from the module location # use full paths for module to be independent from the module location
ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS) ocv_convert_to_full_paths(OPENCV_MODULE_${the_module}_HEADERS)
ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}") set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}") set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
endmacro() endmacro()
...@@ -766,6 +770,11 @@ macro(ocv_create_module) ...@@ -766,6 +770,11 @@ macro(ocv_create_module)
endmacro() endmacro()
macro(_ocv_create_module) macro(_ocv_create_module)
ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
set(OPENCV_MODULE_${the_module}_HEADERS ${OPENCV_MODULE_${the_module}_HEADERS} CACHE INTERNAL "List of header files for ${the_module}")
set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
# The condition we ought to be testing here is whether ocv_add_precompiled_headers will # The condition we ought to be testing here is whether ocv_add_precompiled_headers will
# be called at some point in the future. We can't look into the future, though, # be called at some point in the future. We can't look into the future, though,
# so this will have to do. # so this will have to do.
......
...@@ -288,11 +288,12 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input) ...@@ -288,11 +288,12 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input)
foreach(src ${_sources}) foreach(src ${_sources})
if(NOT "${src}" MATCHES "\\.mm$") if(NOT "${src}" MATCHES "\\.mm$")
get_source_file_property(oldProps "${src}" COMPILE_FLAGS) get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
if(NOT oldProps) get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS)
if(NOT oldProps AND NOT oldProps2)
set(newProperties "-include \"${CMAKE_CURRENT_BINARY_DIR}/${_name}\"") set(newProperties "-include \"${CMAKE_CURRENT_BINARY_DIR}/${_name}\"")
set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}") set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}")
else() else()
ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}") ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}")
endif() endif()
endif() endif()
endforeach() endforeach()
...@@ -339,11 +340,12 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input) ...@@ -339,11 +340,12 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)
AND NOT "${src}" MATCHES "^\$" # CMake generator expressions AND NOT "${src}" MATCHES "^\$" # CMake generator expressions
) )
get_source_file_property(oldProps "${src}" COMPILE_FLAGS) get_source_file_property(oldProps "${src}" COMPILE_FLAGS)
if(NOT oldProps) get_source_file_property(oldProps2 "${src}" COMPILE_DEFINITIONS)
if(NOT oldProps AND NOT oldProps2)
set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"") set(newProperties "/Yu\"${_input}\" /FI\"${_input}\"")
set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}") set_source_files_properties("${src}" PROPERTIES COMPILE_FLAGS "${newProperties}")
else() else()
ocv_debug_message("Skip PCH, flags: ${oldProps} , file: ${src}") ocv_debug_message("Skip PCH, flags: ${oldProps} defines: ${oldProps2}, file: ${src}")
endif() endif()
endif() endif()
endforeach() endforeach()
......
set(the_description "The Core Functionality") set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_module(core ocv_add_module(core
"${OPENCV_HAL_LINKER_LIBS}" "${OPENCV_HAL_LINKER_LIBS}"
OPTIONAL opencv_cudev OPTIONAL opencv_cudev
......
...@@ -7,6 +7,23 @@ ...@@ -7,6 +7,23 @@
#include "cv_cpu_config.h" #include "cv_cpu_config.h"
#include "cv_cpu_helper.h" #include "cv_cpu_helper.h"
// Namespace that wraps CPU-specific implementations in the current translation unit.
// - CV_CPU_DISPATCH_MODE defined (e.g. AVX2): the namespace is opt_<MODE> (e.g. opt_AVX2).
// - otherwise: the namespace is cpu_baseline.
// CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN / _END open and close that namespace.
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_NAMESPACE __CV_CAT(opt_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
#endif
// CV_CPU_DISPATCH(fn, args, MODE1, MODE2, ...) expands to a sequence of
// CV_CPU_CALL_<MODE>(fn, args) statements, one per listed mode, in order.
// It works by token-pasting the next mode name onto __CV_CPU_DISPATCH_CHAIN_;
// each per-mode chain macro emits its call and pastes the following mode.
// An END marker is appended to the mode list; __CV_CPU_DISPATCH_CHAIN_END
// expands to nothing and stops the chain if it is reached.
// `args` is the parenthesized argument list that is placed after `fn`.
#define __CV_CPU_DISPATCH_CHAIN_END(fn, args, mode, ...) /* done */
#define __CV_CPU_DISPATCH(fn, args, mode, ...) __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define __CV_CPU_DISPATCH_EXPAND(fn, args, ...) __CV_EXPAND(__CV_CPU_DISPATCH(fn, args, __VA_ARGS__))
#define CV_CPU_DISPATCH(fn, args, ...) __CV_CPU_DISPATCH_EXPAND(fn, args, __VA_ARGS__, END) // expand macros
#if defined CV_ENABLE_INTRINSICS \ #if defined CV_ENABLE_INTRINSICS \
&& !defined CV_DISABLE_OPTIMIZATION \ && !defined CV_DISABLE_OPTIMIZATION \
&& !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \ && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
...@@ -76,6 +93,16 @@ ...@@ -76,6 +93,16 @@
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
// Available only when this translation unit is compiled with AVX enabled
// (CV_CPU_COMPILE_AVX) while AVX is not part of the baseline
// (CV_CPU_BASELINE_COMPILE_AVX undefined).
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
// Scope guard: its destructor calls _mm256_zeroupper() when the object goes
// out of scope.
// NOTE(review): presumably used to avoid AVX/SSE transition penalties when
// returning into code built without AVX - confirm intent with the author.
struct VZeroUpperGuard {
#ifdef __GNUC__
// Force inlining of the destructor on GCC-compatible compilers.
__attribute__((always_inline))
#endif
inline ~VZeroUpperGuard() { _mm256_zeroupper(); }
};
// Declares a guard named __vzeroupper_guard in the current scope.
#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard;
#endif
#endif // __OPENCV_BUILD #endif // __OPENCV_BUILD
......
...@@ -2,132 +2,147 @@ ...@@ -2,132 +2,147 @@
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
# define CV_CPU_HAS_SUPPORT_SSE 1 # define CV_CPU_HAS_SUPPORT_SSE 1
# define CV_CPU_CALL_SSE(...) return __VA_ARGS__ # define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
# define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE)) # define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
# define CV_CPU_CALL_SSE(...) if (CV_CPU_HAS_SUPPORT_SSE) return __VA_ARGS__ # define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSE 0 # define CV_CPU_HAS_SUPPORT_SSE 0
# define CV_CPU_CALL_SSE(...) # define CV_CPU_CALL_SSE(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
# define CV_CPU_HAS_SUPPORT_SSE2 1 # define CV_CPU_HAS_SUPPORT_SSE2 1
# define CV_CPU_CALL_SSE2(...) return __VA_ARGS__ # define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
# define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2)) # define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
# define CV_CPU_CALL_SSE2(...) if (CV_CPU_HAS_SUPPORT_SSE2) return __VA_ARGS__ # define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSE2 0 # define CV_CPU_HAS_SUPPORT_SSE2 0
# define CV_CPU_CALL_SSE2(...) # define CV_CPU_CALL_SSE2(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
# define CV_CPU_HAS_SUPPORT_SSE3 1 # define CV_CPU_HAS_SUPPORT_SSE3 1
# define CV_CPU_CALL_SSE3(...) return __VA_ARGS__ # define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
# define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3)) # define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
# define CV_CPU_CALL_SSE3(...) if (CV_CPU_HAS_SUPPORT_SSE3) return __VA_ARGS__ # define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSE3 0 # define CV_CPU_HAS_SUPPORT_SSE3 0
# define CV_CPU_CALL_SSE3(...) # define CV_CPU_CALL_SSE3(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
# define CV_CPU_HAS_SUPPORT_SSSE3 1 # define CV_CPU_HAS_SUPPORT_SSSE3 1
# define CV_CPU_CALL_SSSE3(...) return __VA_ARGS__ # define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
# define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3)) # define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
# define CV_CPU_CALL_SSSE3(...) if (CV_CPU_HAS_SUPPORT_SSSE3) return __VA_ARGS__ # define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSSE3 0 # define CV_CPU_HAS_SUPPORT_SSSE3 0
# define CV_CPU_CALL_SSSE3(...) # define CV_CPU_CALL_SSSE3(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
# define CV_CPU_HAS_SUPPORT_SSE4_1 1 # define CV_CPU_HAS_SUPPORT_SSE4_1 1
# define CV_CPU_CALL_SSE4_1(...) return __VA_ARGS__ # define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
# define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1)) # define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
# define CV_CPU_CALL_SSE4_1(...) if (CV_CPU_HAS_SUPPORT_SSE4_1) return __VA_ARGS__ # define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSE4_1 0 # define CV_CPU_HAS_SUPPORT_SSE4_1 0
# define CV_CPU_CALL_SSE4_1(...) # define CV_CPU_CALL_SSE4_1(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
# define CV_CPU_HAS_SUPPORT_SSE4_2 1 # define CV_CPU_HAS_SUPPORT_SSE4_2 1
# define CV_CPU_CALL_SSE4_2(...) return __VA_ARGS__ # define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
# define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2)) # define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
# define CV_CPU_CALL_SSE4_2(...) if (CV_CPU_HAS_SUPPORT_SSE4_2) return __VA_ARGS__ # define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_SSE4_2 0 # define CV_CPU_HAS_SUPPORT_SSE4_2 0
# define CV_CPU_CALL_SSE4_2(...) # define CV_CPU_CALL_SSE4_2(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
# define CV_CPU_HAS_SUPPORT_POPCNT 1 # define CV_CPU_HAS_SUPPORT_POPCNT 1
# define CV_CPU_CALL_POPCNT(...) return __VA_ARGS__ # define CV_CPU_CALL_POPCNT(fn, args) return (opt_POPCNT::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
# define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT)) # define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
# define CV_CPU_CALL_POPCNT(...) if (CV_CPU_HAS_SUPPORT_POPCNT) return __VA_ARGS__ # define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_POPCNT 0 # define CV_CPU_HAS_SUPPORT_POPCNT 0
# define CV_CPU_CALL_POPCNT(...) # define CV_CPU_CALL_POPCNT(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
# define CV_CPU_HAS_SUPPORT_AVX 1 # define CV_CPU_HAS_SUPPORT_AVX 1
# define CV_CPU_CALL_AVX(...) return __VA_ARGS__ # define CV_CPU_CALL_AVX(fn, args) return (opt_AVX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
# define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX)) # define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
# define CV_CPU_CALL_AVX(...) if (CV_CPU_HAS_SUPPORT_AVX) return __VA_ARGS__ # define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_AVX 0 # define CV_CPU_HAS_SUPPORT_AVX 0
# define CV_CPU_CALL_AVX(...) # define CV_CPU_CALL_AVX(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
# define CV_CPU_HAS_SUPPORT_FP16 1 # define CV_CPU_HAS_SUPPORT_FP16 1
# define CV_CPU_CALL_FP16(...) return __VA_ARGS__ # define CV_CPU_CALL_FP16(fn, args) return (opt_FP16::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
# define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16)) # define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
# define CV_CPU_CALL_FP16(...) if (CV_CPU_HAS_SUPPORT_FP16) return __VA_ARGS__ # define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_FP16 0 # define CV_CPU_HAS_SUPPORT_FP16 0
# define CV_CPU_CALL_FP16(...) # define CV_CPU_CALL_FP16(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
# define CV_CPU_HAS_SUPPORT_AVX2 1 # define CV_CPU_HAS_SUPPORT_AVX2 1
# define CV_CPU_CALL_AVX2(...) return __VA_ARGS__ # define CV_CPU_CALL_AVX2(fn, args) return (opt_AVX2::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
# define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2)) # define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
# define CV_CPU_CALL_AVX2(...) if (CV_CPU_HAS_SUPPORT_AVX2) return __VA_ARGS__ # define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_AVX2 0 # define CV_CPU_HAS_SUPPORT_AVX2 0
# define CV_CPU_CALL_AVX2(...) # define CV_CPU_CALL_AVX2(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
# define CV_CPU_HAS_SUPPORT_FMA3 1 # define CV_CPU_HAS_SUPPORT_FMA3 1
# define CV_CPU_CALL_FMA3(...) return __VA_ARGS__ # define CV_CPU_CALL_FMA3(fn, args) return (opt_FMA3::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3 #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
# define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3)) # define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
# define CV_CPU_CALL_FMA3(...) if (CV_CPU_HAS_SUPPORT_FMA3) return __VA_ARGS__ # define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_FMA3 0 # define CV_CPU_HAS_SUPPORT_FMA3 0
# define CV_CPU_CALL_FMA3(...) # define CV_CPU_CALL_FMA3(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_CPU_HAS_SUPPORT_NEON 1 # define CV_CPU_HAS_SUPPORT_NEON 1
# define CV_CPU_CALL_NEON(...) return __VA_ARGS__ # define CV_CPU_CALL_NEON(fn, args) return (opt_NEON::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
# define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON)) # define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
# define CV_CPU_CALL_NEON(...) if (CV_CPU_HAS_SUPPORT_NEON) return __VA_ARGS__ # define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
#else #else
# define CV_CPU_HAS_SUPPORT_NEON 0 # define CV_CPU_HAS_SUPPORT_NEON 0
# define CV_CPU_CALL_NEON(...) # define CV_CPU_CALL_NEON(fn, args)
#endif #endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
...@@ -52,6 +52,17 @@ ...@@ -52,6 +52,17 @@
#include "cvconfig.h" #include "cvconfig.h"
#endif #endif
// Identity macro: expands to its argument unchanged.
// NOTE(review): presumably used to force an extra rescan of expansions that
// forward __VA_ARGS__ (some preprocessors need this) - confirm against users.
#ifndef __CV_EXPAND
#define __CV_EXPAND(x) x
#endif
// Token pasting with two extra levels of indirection: __CV_CAT(x, y) forwards to
// __CV_CAT_ and then to __CV_CAT__, so arguments that are themselves macros are
// fully expanded before `##` joins them. All three helpers share the single
// __CV_CAT guard.
#ifndef __CV_CAT
#define __CV_CAT__(x, y) x ## y
#define __CV_CAT_(x, y) __CV_CAT__(x, y)
#define __CV_CAT(x, y) __CV_CAT_(x, y)
#endif
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif #endif
......
...@@ -60,6 +60,25 @@ ...@@ -60,6 +60,25 @@
// access from within opencv code more accessible // access from within opencv code more accessible
namespace cv { namespace cv {
// Everything below is skipped when CV_DOXYGEN is defined.
#ifndef CV_DOXYGEN
// Select the namespace that wraps the HAL intrinsics in this translation unit:
//   - hal_<CV_CPU_DISPATCH_MODE> when a dispatch mode is defined,
//   - hal_baseline otherwise.
// The _BEGIN/_END macros open and close that namespace.
#ifdef CV_CPU_DISPATCH_MODE
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
// Open and immediately close the namespace so that it is declared before the
// using-directive below names it.
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
// Make the contents of the selected namespace visible in the enclosing namespace.
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// Reopen the namespace for the declarations that follow; it is closed later by a
// matching CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END.
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
//! @addtogroup core_hal_intrin //! @addtogroup core_hal_intrin
//! @{ //! @{
...@@ -281,6 +300,9 @@ template <typename T> struct V_SIMD128Traits ...@@ -281,6 +300,9 @@ template <typename T> struct V_SIMD128Traits
//! @} //! @}
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} }
#ifdef CV_DOXYGEN #ifdef CV_DOXYGEN
...@@ -323,6 +345,10 @@ template <typename T> struct V_SIMD128Traits ...@@ -323,6 +345,10 @@ template <typename T> struct V_SIMD128Traits
namespace cv { namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
template <typename R> struct V_RegTrait128; template <typename R> struct V_RegTrait128;
template <> struct V_RegTrait128<uchar> { template <> struct V_RegTrait128<uchar> {
...@@ -407,6 +433,10 @@ template <> struct V_RegTrait128<double> { ...@@ -407,6 +433,10 @@ template <> struct V_RegTrait128<double> {
}; };
#endif #endif
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} // cv:: } // cv::
//! @endcond //! @endcond
......
...@@ -53,6 +53,10 @@ ...@@ -53,6 +53,10 @@
namespace cv namespace cv
{ {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
/** @addtogroup core_hal_intrin /** @addtogroup core_hal_intrin
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
...@@ -1827,7 +1831,9 @@ static inline bool hasSIMD128() ...@@ -1827,7 +1831,9 @@ static inline bool hasSIMD128()
//! @} //! @}
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} }
#endif #endif
...@@ -53,6 +53,8 @@ namespace cv ...@@ -53,6 +53,8 @@ namespace cv
//! @cond IGNORED //! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1 #define CV_SIMD128 1
#if defined(__aarch64__) #if defined(__aarch64__)
#define CV_SIMD128_64F 1 #define CV_SIMD128_64F 1
...@@ -1238,11 +1240,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) ...@@ -1238,11 +1240,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
//! @brief Check CPU capability of SIMD operation //! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128() static inline bool hasSIMD128()
{ {
return checkHardwareSupport(CV_CPU_NEON); return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
} }
//! @} //! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond //! @endcond
} }
......
...@@ -56,6 +56,8 @@ namespace cv ...@@ -56,6 +56,8 @@ namespace cv
//! @cond IGNORED //! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16 struct v_uint8x16
{ {
typedef uchar lane_type; typedef uchar lane_type;
...@@ -1791,11 +1793,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a) ...@@ -1791,11 +1793,13 @@ inline v_float16x4 v_cvt_f16(const v_float32x4& a)
//! @brief Check CPU capability of SIMD operation //! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128() static inline bool hasSIMD128()
{ {
return checkHardwareSupport(CV_CPU_SSE2); return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
} }
//! @} //! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond //! @endcond
} }
......
...@@ -540,7 +540,7 @@ CV_EXPORTS InstrNode* getCurrentNode(); ...@@ -540,7 +540,7 @@ CV_EXPORTS InstrNode* getCurrentNode();
///// General instrumentation ///// General instrumentation
// General OpenCV region instrumentation macro // General OpenCV region instrumentation macro
#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN) #define CV_INSTRUMENT_REGION_() CV_INSTRUMENT_REGION_META(__FUNCTION__, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN)
// Custom OpenCV region instrumentation macro // Custom OpenCV region instrumentation macro
#define CV_INSTRUMENT_REGION_NAME(NAME) CV_INSTRUMENT_REGION_CUSTOM_META(NAME, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN) #define CV_INSTRUMENT_REGION_NAME(NAME) CV_INSTRUMENT_REGION_CUSTOM_META(NAME, false, ::cv::instr::TYPE_GENERAL, ::cv::instr::IMPL_PLAIN)
// Instrumentation for parallel_for_ or other regions which forks and gathers threads // Instrumentation for parallel_for_ or other regions which forks and gathers threads
...@@ -566,7 +566,7 @@ CV_EXPORTS InstrNode* getCurrentNode(); ...@@ -566,7 +566,7 @@ CV_EXPORTS InstrNode* getCurrentNode();
#else #else
#define CV_INSTRUMENT_REGION_META(...) #define CV_INSTRUMENT_REGION_META(...)
#define CV_INSTRUMENT_REGION() #define CV_INSTRUMENT_REGION_()
#define CV_INSTRUMENT_REGION_NAME(...) #define CV_INSTRUMENT_REGION_NAME(...)
#define CV_INSTRUMENT_REGION_MT_FORK() #define CV_INSTRUMENT_REGION_MT_FORK()
...@@ -580,6 +580,12 @@ CV_EXPORTS InstrNode* getCurrentNode(); ...@@ -580,6 +580,12 @@ CV_EXPORTS InstrNode* getCurrentNode();
#define CV_INSTRUMENT_MARK_OPENCL(...) #define CV_INSTRUMENT_MARK_OPENCL(...)
#endif #endif
// General region instrumentation macro. It expands to CV_INSTRUMENT_REGION_(),
// prefixed with __CV_AVX_GUARD when that macro is defined.
// NOTE(review): __CV_AVX_GUARD is defined elsewhere; presumably it resets AVX
// upper-register state on function entry - confirm before relying on it.
#ifdef __CV_AVX_GUARD
#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_()
#else
#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_()
#endif
//! @endcond //! @endcond
#endif // OPENCV_CORE_PRIVATE_HPP #endif // OPENCV_CORE_PRIVATE_HPP
// Helper file for including the declarations of dispatched functions:
//
// Usage:
// #define CV_CPU_SIMD_FILENAME "<filename>.simd.hpp"
// #define CV_CPU_DISPATCH_MODE AVX2
// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
// #define CV_CPU_DISPATCH_MODE SSE2
// #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
// With optimizations enabled, silence MSVC warning C4702 (unreachable code).
// NOTE(review): presumably needed because the dispatch macros emit several
// `return` statements in a row - confirm.
#ifndef CV_DISABLE_OPTIMIZATION
#ifdef _MSC_VER
#pragma warning(disable: 4702) // unreachable code
#endif
#endif
// Ask the included file for declarations only. This macro is not undefined at
// the end of this file, so it stays defined for the rest of the translation unit.
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#endif
// Redefine the namespace wrappers so that the included declarations land in
// namespace opt_<CV_CPU_DISPATCH_MODE>.
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
// Include the file named by the caller-provided CV_CPU_SIMD_FILENAME macro.
#include CV_CPU_SIMD_FILENAME
// Clean up so that this helper can be included again with another
// CV_CPU_DISPATCH_MODE (the caller has to define the mode again each time).
#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
#undef CV_CPU_DISPATCH_MODE
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "mathfuncs_core.simd.hpp"
#include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv { namespace hal {

// Public entry points of the core math HAL.
//
// Every dispatching function below follows the same sequence:
//   1. CV_INSTRUMENT_REGION()  - instrumentation marker for the call;
//   2. CALL_HAL(...)           - hook for an external cv_hal_* replacement;
//   3. CV_IPP_RUN_FAST(...)    - hook for the matching IPP primitive (where one is listed);
//   4. CV_CPU_DISPATCH(...)    - forwards the call to one of the implementations
//                                listed in CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): CALL_HAL, CV_IPP_RUN_FAST and CV_CPU_DISPATCH are defined outside
// this file; presumably each returns from the function when it handles the
// call - confirm against their definitions.

///////////////////////////////////// ATAN2 ////////////////////////////////////

// Angles for `len` (X, Y) float pairs, written to `angle`.
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);

    CV_CPU_DISPATCH(fastAtan32f, (Y, X, angle, len, angleInDegrees),
        CV_CPU_DISPATCH_MODES_ALL);
}

// Double-precision counterpart of fastAtan32f.
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);

    CV_CPU_DISPATCH(fastAtan64f, (Y, X, angle, len, angleInDegrees),
        CV_CPU_DISPATCH_MODES_ALL);
}

// deprecated: forwards to fastAtan32f
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees)
{
    CV_INSTRUMENT_REGION()

    fastAtan32f(Y, X, angle, len, angleInDegrees);
}

/////////////////////////////////// MAGNITUDE //////////////////////////////////

void magnitude32f(const float* x, const float* y, float* mag, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0);

    CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

void magnitude64f(const double* x, const double* y, double* mag, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0);

    CV_CPU_DISPATCH(magnitude64f, (x, y, mag, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

//////////////////////////////// SQRT / INVSQRT ////////////////////////////////

void invSqrt32f(const float* src, float* dst, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0);

    CV_CPU_DISPATCH(invSqrt32f, (src, dst, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

void invSqrt64f(const double* src, double* dst, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0);

    CV_CPU_DISPATCH(invSqrt64f, (src, dst, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

void sqrt32f(const float* src, float* dst, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0);

    CV_CPU_DISPATCH(sqrt32f, (src, dst, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

void sqrt64f(const double* src, double* dst, int len)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0);

    CV_CPU_DISPATCH(sqrt64f, (src, dst, len),
        CV_CPU_DISPATCH_MODES_ALL);
}

/////////////////////////////////// EXP / LOG //////////////////////////////////

void exp32f(const float *src, float *dst, int n)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0);

    CV_CPU_DISPATCH(exp32f, (src, dst, n),
        CV_CPU_DISPATCH_MODES_ALL);
}

void exp64f(const double *src, double *dst, int n)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0);

    CV_CPU_DISPATCH(exp64f, (src, dst, n),
        CV_CPU_DISPATCH_MODES_ALL);
}

void log32f(const float *src, float *dst, int n)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(log32f, cv_hal_log32f, src, dst, n);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0);

    CV_CPU_DISPATCH(log32f, (src, dst, n),
        CV_CPU_DISPATCH_MODES_ALL);
}

void log64f(const double *src, double *dst, int n)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(log64f, cv_hal_log64f, src, dst, n);
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0);

    CV_CPU_DISPATCH(log64f, (src, dst, n),
        CV_CPU_DISPATCH_MODES_ALL);
}

//=============================================================================
// for compatibility with 3.0
//
// Overloaded names that simply forward to the type-suffixed functions above.

void exp(const float* src, float* dst, int n)
{
    exp32f(src, dst, n);
}

void exp(const double* src, double* dst, int n)
{
    exp64f(src, dst, n);
}

void log(const float* src, float* dst, int n)
{
    log32f(src, dst, n);
}

void log(const double* src, double* dst, int n)
{
    log64f(src, dst, n);
}

void magnitude(const float* x, const float* y, float* dst, int n)
{
    magnitude32f(x, y, dst, n);
}

void magnitude(const double* x, const double* y, double* dst, int n)
{
    magnitude64f(x, y, dst, n);
}

void sqrt(const float* src, float* dst, int len)
{
    sqrt32f(src, dst, len);
}

void sqrt(const double* src, double* dst, int len)
{
    sqrt64f(src, dst, len);
}

void invSqrt(const float* src, float* dst, int len)
{
    invSqrt32f(src, dst, len);
}

void invSqrt(const double* src, double* dst, int len)
{
    invSqrt64f(src, dst, len);
}

}} // namespace cv::hal::
// Scalar fastAtan2: always served by the baseline implementation, with no
// run-time dispatch. CV_CPU_CALL_BASELINE supplies the `return` statement,
// calling cpu_baseline::fastAtan2(y, x), which is looked up through the
// using-directive below.
float cv::fastAtan2(float y, float x)
{
    using namespace cv::hal;
    CV_CPU_CALL_BASELINE(fastAtan2, (y, x));
}
/*M/////////////////////////////////////////////////////////////////////////////////////// // This file is part of OpenCV project.
// // It is subject to the license terms in the LICENSE file found in the top-level directory
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // of this distribution and at http://opencv.org/license.html.
//
// By downloading, copying, installing or using the software you agree to this license. namespace cv { namespace hal {
// If you do not agree to this license, do not download, install,
// copy or use the software. CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
//
// // forward declarations
// License Agreement void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
// For Open Source Computer Vision Library void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
// void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. void magnitude32f(const float* x, const float* y, float* mag, int len);
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. void magnitude64f(const double* x, const double* y, double* mag, int len);
// Third party copyrights are property of their respective owners. void invSqrt32f(const float* src, float* dst, int len);
// void invSqrt64f(const double* src, double* dst, int len);
// Redistribution and use in source and binary forms, with or without modification, void sqrt32f(const float* src, float* dst, int len);
// are permitted provided that the following conditions are met: void sqrt64f(const double* src, double* dst, int len);
// void exp32f(const float *src, float *dst, int n);
// * Redistribution's of source code must retain the above copyright notice, void exp64f(const double *src, double *dst, int n);
// this list of conditions and the following disclaimer. void log32f(const float *src, float *dst, int n);
// void log64f(const double *src, double *dst, int n);
// * Redistribution's in binary form must reproduce the above copyright notice, float fastAtan2(float y, float x);
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace std; using namespace std;
...@@ -197,23 +180,17 @@ static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angl ...@@ -197,23 +180,17 @@ static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angl
} // anonymous:: } // anonymous::
namespace cv { namespace hal {
///////////////////////////////////// ATAN2 //////////////////////////////////// ///////////////////////////////////// ATAN2 ////////////////////////////////////
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);
atanImpl<float>(Y, X, angle, len, angleInDegrees); atanImpl<float>(Y, X, angle, len, angleInDegrees);
} }
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);
atanImpl<double>(Y, X, angle, len, angleInDegrees); atanImpl<double>(Y, X, angle, len, angleInDegrees);
} }
...@@ -221,7 +198,6 @@ void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool ...@@ -221,7 +198,6 @@ void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
fastAtan32f(Y, X, angle, len, angleInDegrees); fastAtan32f(Y, X, angle, len, angleInDegrees);
} }
...@@ -229,9 +205,6 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) ...@@ -229,9 +205,6 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0);
int i = 0; int i = 0;
#if CV_SIMD128 #if CV_SIMD128
...@@ -257,9 +230,6 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) ...@@ -257,9 +230,6 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0);
int i = 0; int i = 0;
#if CV_SIMD128_64F #if CV_SIMD128_64F
...@@ -286,9 +256,6 @@ void invSqrt32f(const float* src, float* dst, int len) ...@@ -286,9 +256,6 @@ void invSqrt32f(const float* src, float* dst, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0);
int i = 0; int i = 0;
#if CV_SIMD128 #if CV_SIMD128
...@@ -310,9 +277,6 @@ void invSqrt64f(const double* src, double* dst, int len) ...@@ -310,9 +277,6 @@ void invSqrt64f(const double* src, double* dst, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0);
int i = 0; int i = 0;
#if CV_SSE2 #if CV_SSE2
...@@ -330,9 +294,6 @@ void sqrt32f(const float* src, float* dst, int len) ...@@ -330,9 +294,6 @@ void sqrt32f(const float* src, float* dst, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0);
int i = 0; int i = 0;
#if CV_SIMD128 #if CV_SIMD128
...@@ -354,9 +315,6 @@ void sqrt64f(const double* src, double* dst, int len) ...@@ -354,9 +315,6 @@ void sqrt64f(const double* src, double* dst, int len)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0);
int i = 0; int i = 0;
#if CV_SIMD128_64F #if CV_SIMD128_64F
...@@ -381,9 +339,6 @@ void exp32f(const float *src, float *dst, int n) ...@@ -381,9 +339,6 @@ void exp32f(const float *src, float *dst, int n)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
dst[i] = std::exp(src[i]); dst[i] = std::exp(src[i]);
...@@ -394,9 +349,6 @@ void exp64f(const double *src, double *dst, int n) ...@@ -394,9 +349,6 @@ void exp64f(const double *src, double *dst, int n)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
dst[i] = std::exp(src[i]); dst[i] = std::exp(src[i]);
...@@ -407,9 +359,6 @@ void log32f(const float *src, float *dst, int n) ...@@ -407,9 +359,6 @@ void log32f(const float *src, float *dst, int n)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(log32f, cv_hal_log32f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
dst[i] = std::log(src[i]); dst[i] = std::log(src[i]);
...@@ -419,9 +368,6 @@ void log64f(const double *src, double *dst, int n) ...@@ -419,9 +368,6 @@ void log64f(const double *src, double *dst, int n)
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(log64f, cv_hal_log64f, src, dst, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
dst[i] = std::log(src[i]); dst[i] = std::log(src[i]);
...@@ -534,9 +480,6 @@ void exp32f( const float *_x, float *y, int n ) ...@@ -534,9 +480,6 @@ void exp32f( const float *_x, float *y, int n )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, _x, y, n) >= 0);
static const float static const float
A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
...@@ -551,7 +494,90 @@ void exp32f( const float *_x, float *y, int n ) ...@@ -551,7 +494,90 @@ void exp32f( const float *_x, float *y, int n )
const Cv32suf* x = (const Cv32suf*)_x; const Cv32suf* x = (const Cv32suf*)_x;
Cv32suf buf[4]; Cv32suf buf[4];
#if CV_SSE2 #if CV_AVX2
if( n >= 8 )
{
    // AVX2 path: processes 8 floats per iteration; any remaining elements are
    // left to the code that follows this block (i is advanced accordingly).
    static const __m256d prescale4 = _mm256_set1_pd(exp_prescale);
    static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale);
    static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
    static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));

    static const __m256 mA1 = _mm256_set1_ps(A1);
    static const __m256 mA2 = _mm256_set1_ps(A2);
    static const __m256 mA3 = _mm256_set1_ps(A3);
    static const __m256 mA4 = _mm256_set1_ps(A4);

    // Aligned stores are only legal when the destination is 32-byte aligned.
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    ushort CV_DECL_ALIGNED(32) tab_idx[16];

    for( ; i <= n - 8; i += 8 )
    {
        __m128i xi0, xi1;

        // Clamp the inputs to [minval4, maxval4] and widen them to double.
        __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
        __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4));

        xd0 = _mm256_mul_pd(xd0, prescale4);
        xd1 = _mm256_mul_pd(xd1, prescale4);

        // Split the scaled value into an integer part (xi*) and a remainder (xd*).
        xi0 = _mm256_cvtpd_epi32(xd0);
        xi1 = _mm256_cvtpd_epi32(xd1);

        xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0));
        xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1));

        // gcc does not support _mm256_set_m128
        //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
        __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1);

        xf = _mm256_mul_ps(xf, postscale8);

        // Pack the eight 32-bit integer parts into 16-bit lanes (signed saturation).
        xi0 = _mm_packs_epi32(xi0, xi1);

        // The bits selected by EXPTAB_MASK index the expTab lookup table.
        _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));

        // The remaining high bits, biased by 127 and clamped to [0, 255], become
        // the float exponent field once shifted left by 23 below.
        xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
        xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
        xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
        xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
        xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());

        __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]);
        __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]);

        // gcc does not support _mm256_set_m128
        //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
        __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1);

        // Equivalent of _mm256_set_m128i(xi1, xi0): xi0 in the low half, xi1 in
        // the high half. Built with cast/insert intrinsics rather than C-style
        // casts between vector types, which are a GCC extension and are rejected
        // by MSVC.
        __m256i temp = _mm256_insertf128_si256(_mm256_castsi128_si256(xi0), xi1, 1);

        yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));

        // Horner evaluation of (((xf + A1)*xf + A2)*xf + A3)*xf + A4.
        __m256 zf = _mm256_add_ps(xf, mA1);

#if CV_FMA3
        zf = _mm256_fmadd_ps(zf, xf, mA2);
        zf = _mm256_fmadd_ps(zf, xf, mA3);
        zf = _mm256_fmadd_ps(zf, xf, mA4);
#else
        zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2);
        zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3);
        zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4);
#endif
        zf = _mm256_mul_ps(zf, yf);

        if( y_aligned )
        {
            _mm256_store_ps(y + i, zf);
        }
        else
        {
            _mm256_storeu_ps(y + i, zf);
        }
    }
}
#elif CV_SSE2
if( n >= 8 ) if( n >= 8 )
{ {
static const __m128d prescale2 = _mm_set1_pd(exp_prescale); static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
...@@ -738,9 +764,6 @@ void exp64f( const double *_x, double *y, int n ) ...@@ -738,9 +764,6 @@ void exp64f( const double *_x, double *y, int n )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, _x, y, n) >= 0);
static const double static const double
A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
...@@ -1187,9 +1210,6 @@ void log32f( const float *_x, float *y, int n ) ...@@ -1187,9 +1210,6 @@ void log32f( const float *_x, float *y, int n )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(log32f, cv_hal_log32f, _x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, _x, y, n) >= 0);
static const float shift[] = { 0, -1.f/512 }; static const float shift[] = { 0, -1.f/512 };
static const float static const float
A0 = 0.3333333333333333333333333f, A0 = 0.3333333333333333333333333f,
...@@ -1336,9 +1356,6 @@ void log64f( const double *x, double *y, int n ) ...@@ -1336,9 +1356,6 @@ void log64f( const double *x, double *y, int n )
{ {
CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION()
CALL_HAL(log64f, cv_hal_log64f, x, y, n);
CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, x, y, n) >= 0);
static const double shift[] = { 0, -1./512 }; static const double shift[] = { 0, -1./512 };
static const double static const double
A7 = 1.0, A7 = 1.0,
...@@ -1524,64 +1541,13 @@ void log64f( const double *x, double *y, int n ) ...@@ -1524,64 +1541,13 @@ void log64f( const double *x, double *y, int n )
#endif // issue 7795 #endif // issue 7795
//============================================================================= float fastAtan2( float y, float x )
// for compatibility with 3.0
void exp(const float* src, float* dst, int n)
{
exp32f(src, dst, n);
}
void exp(const double* src, double* dst, int n)
{
exp64f(src, dst, n);
}
void log(const float* src, float* dst, int n)
{ {
log32f(src, dst, n); return atanImpl<float>(y, x);
}
void log(const double* src, double* dst, int n)
{
log64f(src, dst, n);
}
void magnitude(const float* x, const float* y, float* dst, int n)
{
magnitude32f(x, y, dst, n);
}
void magnitude(const double* x, const double* y, double* dst, int n)
{
magnitude64f(x, y, dst, n);
}
void sqrt(const float* src, float* dst, int len)
{
sqrt32f(src, dst, len);
}
void sqrt(const double* src, double* dst, int len)
{
sqrt64f(src, dst, len);
}
void invSqrt(const float* src, float* dst, int len)
{
invSqrt32f(src, dst, len);
}
void invSqrt(const double* src, double* dst, int len)
{
invSqrt64f(src, dst, len);
} }
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
} // cv::hal:: CV_CPU_OPTIMIZATION_NAMESPACE_END
} // cv::
float cv::fastAtan2( float y, float x ) }} // namespace cv::hal
{
return atanImpl<float>(y, x);
}
...@@ -24,6 +24,7 @@ if(NOT OPENCV_INITIAL_PASS) ...@@ -24,6 +24,7 @@ if(NOT OPENCV_INITIAL_PASS)
message(STATUS "Processing WORLD modules...") message(STATUS "Processing WORLD modules...")
foreach(m ${OPENCV_MODULES_BUILD}) foreach(m ${OPENCV_MODULES_BUILD})
set(the_module ${m})
if(OPENCV_MODULE_${m}_IS_PART_OF_WORLD) if(OPENCV_MODULE_${m}_IS_PART_OF_WORLD)
message(STATUS " module ${m}...") message(STATUS " module ${m}...")
set(CMAKE_CURRENT_SOURCE_DIR "${OPENCV_MODULE_${m}_LOCATION}") set(CMAKE_CURRENT_SOURCE_DIR "${OPENCV_MODULE_${m}_LOCATION}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment