Commit 91ac9688 authored by Andrey Kamaev's avatar Andrey Kamaev

Allow OpenCL acceleration in every OpenCV module

parent d28df08e
...@@ -782,7 +782,7 @@ if(HAVE_CUDA) ...@@ -782,7 +782,7 @@ if(HAVE_CUDA)
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO) status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
endif() endif()
if(HAVE_OPENCL AND BUILD_opencv_ocl) if(HAVE_OPENCL)
status("") status("")
status(" OpenCL") status(" OpenCL")
if(OPENCL_INCLUDE_DIR) if(OPENCL_INCLUDE_DIR)
......
...@@ -432,10 +432,22 @@ macro(ocv_glob_module_sources) ...@@ -432,10 +432,22 @@ macro(ocv_glob_module_sources)
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
file(GLOB cl_kernels "src/opencl/*.cl")
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Include" FILES ${lib_hdrs}) source_group("Include" FILES ${lib_hdrs})
source_group("Include\\detail" FILES ${lib_hdrs_detail}) source_group("Include\\detail" FILES ${lib_hdrs_detail})
if(HAVE_OPENCL AND cl_kernels)
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
endif()
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs}) ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
endmacro() endmacro()
...@@ -449,6 +461,9 @@ macro(ocv_create_module) ...@@ -449,6 +461,9 @@ macro(ocv_create_module)
if(NOT "${ARGN}" STREQUAL "SKIP_LINK") if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
if(HAVE_OPENCL AND OPENCL_LIBRARIES)
target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
endif()
endif() endif()
add_dependencies(opencv_modules ${the_module}) add_dependencies(opencv_modules ${the_module})
......
...@@ -3,45 +3,5 @@ if(NOT HAVE_OPENCL) ...@@ -3,45 +3,5 @@ if(NOT HAVE_OPENCL)
endif() endif()
set(the_description "OpenCL-accelerated Computer Vision") set(the_description "OpenCL-accelerated Computer Vision")
ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree) ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
ocv_module_include_directories(${OPENCL_INCLUDE_DIRS})
file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
add_custom_command(
OUTPUT ${kernels_cpp}
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
DEPENDS ${CL_FILES} ${cl2cpp_script})
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
ocv_create_module(${OPENCL_LIBRARIES})
ocv_add_precompiled_headers(${the_module})
################################################################################################################
################################ OpenCL Module Tests ##################################################
################################################################################################################
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
FILES "Src" ${test_srcs})
################################################################################################################
################################ OpenCL Module Performance ##################################################
################################################################################################################
file(GLOB perf_srcs "perf/*.cpp")
file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
FILES "Src" ${perf_srcs})
This diff is collapsed.
...@@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset ...@@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data; uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
...@@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs ...@@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse ...@@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, ...@@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3) #define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3) #define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
...@@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, ...@@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
int4 tmp; int4 tmp;
...@@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset ...@@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
...@@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs ...@@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
......
...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr ...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr ...@@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src ...@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src ...@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ...@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ...@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr ...@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr ...@@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src ...@@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
} }
} }
#endif #endif
...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_ ...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
} }
} }
#endif #endif
...@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr ...@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
} }
} }
#endif #endif
...@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr ...@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr ...@@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data; uchar4 tmp_data = ~ src1_data;
/* if(src1_index < 0) /* if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
...@@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src ...@@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s ...@@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr ...@@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o ...@@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
{ {
int src_index = mad24(y, src_step, (x << 3) + src_offset); int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data; char8 data;
data = *((__global char8 *)((__global char *)src + src_index)); data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data; data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data; *((__global char8 *)((__global char *)dst + dst_index)) = data;
} }
} }
#endif #endif
...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src ...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 ...@@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr ...@@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src ...@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1 ...@@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1
} }
} }
#endif #endif
...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s ...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
} }
} }
#endif #endif
...@@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in ...@@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
} }
} }
#endif #endif
...@@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src ...@@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
} }
} }
#endif #endif
...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr ...@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr ...@@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src ...@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src ...@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ...@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ...@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr ...@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
...@@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr ...@@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src ...@@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src
} }
} }
#endif #endif
...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_ ...@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
} }
} }
#endif #endif
...@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr ...@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
} }
} }
#endif #endif
...@@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse ...@@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
} }
} }
#endif #endif
...@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of ...@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
...@@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of ...@@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
...@@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of ...@@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
src1_data.s01234567 = src1_data.s45670123; src1_data.s01234567 = src1_data.s45670123;
if(src1_index== -2) if(src1_index== -2)
src1_data.s01234567 = src1_data.s23456701; src1_data.s01234567 = src1_data.s23456701;
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
......
...@@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int ...@@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
} }
} }
...@@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el ...@@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el
dst[gid + groupnum] = localmem_max[0]; dst[gid + groupnum] = localmem_max[0];
} }
} }
...@@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in ...@@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
dst[gid] = localmem_sum[0]; dst[gid] = localmem_sum[0];
} }
} }
...@@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum, ...@@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
dst[gid*3+2] = localmem_sum3[0]; dst[gid*3+2] = localmem_sum3[0];
} }
} }
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// @Authors // @Authors
// Liu Liujun, liujun@multicorewareinc.com // Liu Liujun, liujun@multicorewareinc.com
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
...@@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0( ...@@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0(
int pos = mad24(idy,istep >> 2,idx); int pos = mad24(idy,istep >> 2,idx);
int wpos = mad24(idy,wstep >> 2,idx); int wpos = mad24(idy,wstep >> 2,idx);
float4 w1 = weight1[wpos], w2 = weight2[wpos]; float4 w1 = weight1[wpos], w2 = weight2[wpos];
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f)); convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
} }
} }
...@@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0( ...@@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0(
int wpos = mad24(idy,wstep, idx); int wpos = mad24(idy,wstep, idx);
float w1 = weight1[wpos]; float w1 = weight1[wpos];
float w2 = weight2[wpos]; float w2 = weight2[wpos];
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f)); convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
} }
} }
...@@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5( ...@@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5(
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
} }
} }
This diff is collapsed.
...@@ -234,4 +234,3 @@ __kernel ...@@ -234,4 +234,3 @@ __kernel
map_y[y * step_y + x] = ycoo; map_y[y * step_y + x] = ycoo;
} }
} }
...@@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ ...@@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
dst[start_addr] = sum; dst[start_addr] = sum;
} }
} }
...@@ -559,7 +559,3 @@ if(result) ...@@ -559,7 +559,3 @@ if(result)
} }
} }
*/ */
...@@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH ...@@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
newnode[counter].alpha[0] = t1.alpha[0]; newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1]; newnode[counter].alpha[1] = t1.alpha[1];
} }
...@@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global ...@@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
dst[gy*(dst_step >> 2)+gx] = res; dst[gy*(dst_step >> 2)+gx] = res;
} }
} }
...@@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist( ...@@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
} }
} }
*/ */
...@@ -484,4 +484,3 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i ...@@ -484,4 +484,3 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
} }
#undef op(a,b) #undef op(a,b)
...@@ -411,4 +411,3 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src, ...@@ -411,4 +411,3 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
dst[dpos] = src[spos]; dst[dpos] = src[spos];
} }
...@@ -150,4 +150,3 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa ...@@ -150,4 +150,3 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
} }
} }
} }
...@@ -682,4 +682,3 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 ...@@ -682,4 +682,3 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
} }
} }
} }
...@@ -821,4 +821,3 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 ...@@ -821,4 +821,3 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
res[res_idx] = normAcc(num, denum); res[res_idx] = normAcc(num, denum);
} }
} }
...@@ -240,4 +240,3 @@ __kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr, ...@@ -240,4 +240,3 @@ __kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
// outsp[basesp] =(short2)((short)x0,(short)y0); // outsp[basesp] =(short2)((short)x0,(short)y0);
} }
} }
...@@ -27,7 +27,7 @@ typedef long T; ...@@ -27,7 +27,7 @@ typedef long T;
#define DST_ROW_A03 9 #define DST_ROW_A03 9
__kernel void icvContourMoments(int contour_total, __kernel void icvContourMoments(int contour_total,
__global float* reader_oclmat_data, __global float* reader_oclmat_data,
__global T* dst_a, __global T* dst_a,
int dst_step) int dst_step)
{ {
...@@ -58,7 +58,7 @@ __kernel void icvContourMoments(int contour_total, ...@@ -58,7 +58,7 @@ __kernel void icvContourMoments(int contour_total,
dxy = xi_1 * yi - xi * yi_1; dxy = xi_1 * yi - xi * yi_1;
xii_1 = xi_1 + xi; xii_1 = xi_1 + xi;
yii_1 = yi_1 + yi; yii_1 = yi_1 + yi;
dst_step /= sizeof(T); dst_step /= sizeof(T);
*( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
*( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
......
...@@ -57,4 +57,3 @@ __kernel void set_to_with_mask( ...@@ -57,4 +57,3 @@ __kernel void set_to_with_mask(
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment