Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
90df5e00
Commit
90df5e00
authored
Mar 26, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
parents
8c25a8eb
9340fc0c
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
277 additions
and
27 deletions
+277
-27
cl_d3d11_ext.h
3rdparty/include/opencl/1.2/CL/cl_d3d11_ext.h
+122
-0
CMakeLists.txt
CMakeLists.txt
+8
-0
OpenCVDetectOpenCL.cmake
cmake/OpenCVDetectOpenCL.cmake
+7
-2
cvconfig.h.in
cmake/templates/cvconfig.h.in
+3
-0
opencl_core.hpp
...opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
+1
-0
directx.cpp
modules/core/src/directx.cpp
+0
-0
directx.inc.hpp
modules/core/src/directx.inc.hpp
+3
-0
opencl_core.hpp.in
.../src/opencl/runtime/generator/template/opencl_core.hpp.in
+1
-0
opencl_core.cpp
modules/core/src/opencl/runtime/opencl_core.cpp
+1
-0
lrn_layer.cpp
modules/dnn/src/layers/lrn_layer.cpp
+1
-1
onnx_importer.cpp
modules/dnn/src/onnx/onnx_importer.cpp
+31
-0
test_halide_layers.cpp
modules/dnn/test/test_halide_layers.cpp
+4
-0
test_onnx_importer.cpp
modules/dnn/test/test_onnx_importer.cpp
+1
-0
sumpixels.avx512_skx.cpp
modules/imgproc/src/sumpixels.avx512_skx.cpp
+92
-23
sumpixels.cpp
modules/imgproc/src/sumpixels.cpp
+1
-1
opencl-opencv-interop.cpp
samples/opencl/opencl-opencv-interop.cpp
+1
-0
No files found.
3rdparty/include/opencl/1.2/CL/cl_d3d11_ext.h
0 → 100644
View file @
90df5e00
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D11_EXT_H
#define __OPENCL_CL_D3D11_EXT_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern
"C"
{
#endif
/******************************************************************************
* cl_nv_d3d11_sharing */
typedef
cl_uint
cl_d3d11_device_source_nv
;
typedef
cl_uint
cl_d3d11_device_set_nv
;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D11_DEVICE_NV -1006
#define CL_INVALID_D3D11_RESOURCE_NV -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009
// cl_d3d11_device_source_nv
#define CL_D3D11_DEVICE_NV 0x4019
#define CL_D3D11_DXGI_ADAPTER_NV 0x401A
// cl_d3d11_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C
// cl_context_info
#define CL_CONTEXT_D3D11_DEVICE_NV 0x401D
// cl_mem_info
#define CL_MEM_D3D11_RESOURCE_NV 0x401E
// cl_image_info
#define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021
/******************************************************************************/
typedef
CL_API_ENTRY
cl_int
(
CL_API_CALL
*
clGetDeviceIDsFromD3D11NV_fn
)(
cl_platform_id
platform
,
cl_d3d11_device_source_nv
d3d_device_source
,
void
*
d3d_object
,
cl_d3d11_device_set_nv
d3d_device_set
,
cl_uint
num_entries
,
cl_device_id
*
devices
,
cl_uint
*
num_devices
)
CL_API_SUFFIX__VERSION_1_0
;
typedef
CL_API_ENTRY
cl_mem
(
CL_API_CALL
*
clCreateFromD3D11BufferNV_fn
)(
cl_context
context
,
cl_mem_flags
flags
,
ID3D11Buffer
*
resource
,
cl_int
*
errcode_ret
)
CL_API_SUFFIX__VERSION_1_0
;
typedef
CL_API_ENTRY
cl_mem
(
CL_API_CALL
*
clCreateFromD3D11Texture2DNV_fn
)(
cl_context
context
,
cl_mem_flags
flags
,
ID3D11Texture2D
*
resource
,
UINT
subresource
,
cl_int
*
errcode_ret
)
CL_API_SUFFIX__VERSION_1_0
;
typedef
CL_API_ENTRY
cl_mem
(
CL_API_CALL
*
clCreateFromD3D11Texture3DNV_fn
)(
cl_context
context
,
cl_mem_flags
flags
,
ID3D11Texture3D
*
resource
,
UINT
subresource
,
cl_int
*
errcode_ret
)
CL_API_SUFFIX__VERSION_1_0
;
typedef
CL_API_ENTRY
cl_int
(
CL_API_CALL
*
clEnqueueAcquireD3D11ObjectsNV_fn
)(
cl_command_queue
command_queue
,
cl_uint
num_objects
,
const
cl_mem
*
mem_objects
,
cl_uint
num_events_in_wait_list
,
const
cl_event
*
event_wait_list
,
cl_event
*
event
)
CL_API_SUFFIX__VERSION_1_0
;
typedef
CL_API_ENTRY
cl_int
(
CL_API_CALL
*
clEnqueueReleaseD3D11ObjectsNV_fn
)(
cl_command_queue
command_queue
,
cl_uint
num_objects
,
cl_mem
*
mem_objects
,
cl_uint
num_events_in_wait_list
,
const
cl_event
*
event_wait_list
,
cl_event
*
event
)
CL_API_SUFFIX__VERSION_1_0
;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D11_EXT_H
CMakeLists.txt
View file @
90df5e00
...
...
@@ -108,6 +108,10 @@ if(POLICY CMP0067)
cmake_policy
(
SET CMP0067 NEW
)
endif
()
if
(
POLICY CMP0068
)
cmake_policy
(
SET CMP0068 NEW
)
# CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`.
endif
()
include
(
cmake/OpenCVUtils.cmake
)
ocv_cmake_reset_hooks
()
ocv_check_environment_variables
(
OPENCV_CMAKE_HOOKS_DIR
)
...
...
@@ -368,6 +372,9 @@ OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON
OCV_OPTION
(
WITH_DIRECTX
"Include DirectX support"
ON
VISIBLE_IF WIN32 AND NOT WINRT
VERIFY HAVE_DIRECTX
)
OCV_OPTION
(
WITH_OPENCL_D3D11_NV
"Include NVIDIA OpenCL D3D11 support"
WITH_DIRECTX
VISIBLE_IF WIN32 AND NOT WINRT
VERIFY HAVE_OPENCL_D3D11_NV
)
OCV_OPTION
(
WITH_LIBREALSENSE
"Include Intel librealsense support"
OFF
VISIBLE_IF NOT WITH_INTELPERC
VERIFY HAVE_LIBREALSENSE
)
...
...
@@ -1570,6 +1577,7 @@ if(WITH_OPENCL OR HAVE_OPENCL)
IF HAVE_OPENCL_SVM THEN
"SVM"
IF HAVE_CLAMDFFT THEN
"AMDFFT"
IF HAVE_CLAMDBLAS THEN
"AMDBLAS"
IF HAVE_OPENCL_D3D11_NV THEN
"NVD3D11"
ELSE
"no extra features"
)
status
(
""
)
status
(
" OpenCL:"
HAVE_OPENCL THEN
"YES (
${
opencl_features
}
)"
ELSE
"NO"
)
...
...
cmake/OpenCVDetectOpenCL.cmake
View file @
90df5e00
...
...
@@ -2,14 +2,19 @@ set(OPENCL_FOUND ON CACHE BOOL "OpenCL library is found")
if
(
APPLE
)
set
(
OPENCL_LIBRARY
"-framework OpenCL"
CACHE STRING
"OpenCL library"
)
set
(
OPENCL_INCLUDE_DIR
""
CACHE PATH
"OpenCL include directory"
)
else
(
APPLE
)
else
()
set
(
OPENCL_LIBRARY
""
CACHE STRING
"OpenCL library"
)
set
(
OPENCL_INCLUDE_DIR
"
${
OpenCV_SOURCE_DIR
}
/3rdparty/include/opencl/1.2"
CACHE PATH
"OpenCL include directory"
)
ocv_install_3rdparty_licenses
(
opencl-headers
"
${
OpenCV_SOURCE_DIR
}
/3rdparty/include/opencl/LICENSE.txt"
)
endif
(
APPLE
)
endif
()
mark_as_advanced
(
OPENCL_INCLUDE_DIR OPENCL_LIBRARY
)
if
(
OPENCL_FOUND
)
if
(
WITH_OPENCL_D3D11_NV AND EXISTS
"
${
OPENCL_INCLUDE_DIR
}
/CL/cl_d3d11_ext.h"
)
set
(
HAVE_OPENCL_D3D11_NV ON
)
endif
()
if
(
OPENCL_LIBRARY
)
set
(
HAVE_OPENCL_STATIC ON
)
set
(
OPENCL_LIBRARIES
"
${
OPENCL_LIBRARY
}
"
)
...
...
cmake/templates/cvconfig.h.in
View file @
90df5e00
...
...
@@ -100,6 +100,9 @@
#cmakedefine HAVE_OPENCL_STATIC
#cmakedefine HAVE_OPENCL_SVM
/* NVIDIA OpenCL D3D Extensions support */
#cmakedefine HAVE_OPENCL_D3D11_NV
/* OpenEXR codec */
#cmakedefine HAVE_OPENEXR
...
...
modules/core/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
View file @
90df5e00
...
...
@@ -96,6 +96,7 @@
#define clWaitForEvents clWaitForEvents_
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
...
...
modules/core/src/directx.cpp
View file @
90df5e00
This diff is collapsed.
Click to expand it.
modules/core/src/directx.inc.hpp
View file @
90df5e00
...
...
@@ -48,6 +48,9 @@
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include <CL/cl_d3d11.h>
#ifdef HAVE_OPENCL_D3D11_NV
#include <CL/cl_d3d11_ext.h>
#endif
#include <CL/cl_d3d10.h>
#include <CL/cl_dx9_media_sharing.h>
#endif // HAVE_OPENCL
...
...
modules/core/src/opencl/runtime/generator/template/opencl_core.hpp.in
View file @
90df5e00
...
...
@@ -5,6 +5,7 @@
@CL_REMAP_ORIGIN@
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
...
...
modules/core/src/opencl/runtime/opencl_core.cpp
View file @
90df5e00
...
...
@@ -47,6 +47,7 @@
#if defined(HAVE_OPENCL_STATIC)
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
...
...
modules/dnn/src/layers/lrn_layer.cpp
View file @
90df5e00
...
...
@@ -92,7 +92,7 @@ public:
virtual
bool
supportBackend
(
int
backendId
)
CV_OVERRIDE
{
if
(
backendId
==
DNN_BACKEND_INFERENCE_ENGINE
)
return
(
bias
==
1
)
&&
(
preferableTarget
!=
DNN_TARGET_MYRIAD
||
type
==
SPATIAL_NRM
)
;
return
bias
==
1
;
return
backendId
==
DNN_BACKEND_OPENCV
||
backendId
==
DNN_BACKEND_HALIDE
||
(
backendId
==
DNN_BACKEND_VKCOM
&&
haveVulkan
()
&&
(
size
%
2
==
1
)
&&
(
type
==
CHANNEL_NRM
));
...
...
modules/dnn/src/onnx/onnx_importer.cpp
View file @
90df5e00
...
...
@@ -591,6 +591,37 @@ void ONNXImporter::populateNet(Net dstNet)
}
layerParams
.
set
(
"num_output"
,
layerParams
.
blobs
[
0
].
size
[
1
]
*
layerParams
.
get
<
int
>
(
"group"
,
1
));
layerParams
.
set
(
"bias_term"
,
node_proto
.
input_size
()
==
3
);
if
(
layerParams
.
has
(
"output_shape"
))
{
const
DictValue
&
outShape
=
layerParams
.
get
(
"output_shape"
);
if
(
outShape
.
size
()
!=
4
)
CV_Error
(
Error
::
StsNotImplemented
,
"Output shape must have 4 elements."
);
const
int
strideY
=
layerParams
.
get
<
int
>
(
"stride_h"
,
1
);
const
int
strideX
=
layerParams
.
get
<
int
>
(
"stride_w"
,
1
);
const
int
outH
=
outShape
.
getIntValue
(
2
);
const
int
outW
=
outShape
.
getIntValue
(
3
);
if
(
layerParams
.
get
<
String
>
(
"pad_mode"
)
==
"SAME"
)
{
layerParams
.
set
(
"adj_w"
,
(
outW
-
1
)
%
strideX
);
layerParams
.
set
(
"adj_h"
,
(
outH
-
1
)
%
strideY
);
}
else
if
(
layerParams
.
get
<
String
>
(
"pad_mode"
)
==
"VALID"
)
{
if
(
!
layerParams
.
has
(
"kernel_h"
)
||
!
layerParams
.
has
(
"kernel_w"
))
CV_Error
(
Error
::
StsNotImplemented
,
"Required attributes 'kernel_h' and 'kernel_w' are not present."
);
int
kernelH
=
layerParams
.
get
<
int
>
(
"kernel_h"
);
int
kernelW
=
layerParams
.
get
<
int
>
(
"kernel_w"
);
layerParams
.
set
(
"adj_w"
,
(
outW
-
kernelW
)
%
strideX
);
layerParams
.
set
(
"adj_h"
,
(
outH
-
kernelH
)
%
strideY
);
}
}
}
else
if
(
layer_type
==
"Transpose"
)
{
...
...
modules/dnn/test/test_halide_layers.cpp
View file @
90df5e00
...
...
@@ -228,6 +228,10 @@ TEST_P(LRN, Accuracy)
Backend
backendId
=
get
<
0
>
(
get
<
5
>
(
GetParam
()));
Target
targetId
=
get
<
1
>
(
get
<
5
>
(
GetParam
()));
if
((
inSize
.
width
==
5
||
inSize
.
height
==
5
)
&&
targetId
==
DNN_TARGET_MYRIAD
&&
nrmType
==
"ACROSS_CHANNELS"
)
throw
SkipTestException
(
"This test case is disabled"
);
LayerParams
lp
;
lp
.
set
(
"norm_region"
,
nrmType
);
lp
.
set
(
"local_size"
,
localSize
);
...
...
modules/dnn/test/test_onnx_importer.cpp
View file @
90df5e00
...
...
@@ -73,6 +73,7 @@ TEST_P(Test_ONNX_layers, Deconvolution)
testONNXModels
(
"deconvolution"
);
testONNXModels
(
"two_deconvolution"
);
testONNXModels
(
"deconvolution_group"
);
testONNXModels
(
"deconvolution_output_shape"
);
}
TEST_P
(
Test_ONNX_layers
,
Dropout
)
...
...
modules/imgproc/src/sumpixels.avx512_skx.cpp
View file @
90df5e00
...
...
@@ -13,7 +13,6 @@ namespace { // Anonymous namespace to avoid exposing the implementation classes
// NOTE: Look at the bottom of the file for the entry-point function for external callers
//
// TODO: Add support for 1 channel input (WIP: currently hitting hardware glassjaw)
template
<
size_t
num_channels
>
class
IntegralCalculator
;
template
<
size_t
num_channels
>
...
...
@@ -191,51 +190,55 @@ public:
}
// The calculate_integral function referenced here must be implemented in the templated derivatives
// because the algorithm depends heavily on the number of channels in the image
// This is the incomplete definition (just the prototype) here.
//
static
CV_ALWAYS_INLINE
__m512d
calculate_integral
(
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
);
static
CV_ALWAYS_INLINE
__m512i
read_64_bytes
(
const
__m512i
*
srcs
,
__mmask64
data_mask
)
{
__m512i
read_64_bytes
(
const
__m512i
*
srcs
,
const
__mmask64
data_mask
)
{
return
_mm512_maskz_loadu_epi8
(
data_mask
,
srcs
);
}
static
CV_ALWAYS_INLINE
__m128i
extract_lower_16bytes
(
__m512i
src_64byte_chunk
)
{
__m128i
extract_lower_16bytes
(
const
__m512i
src_64byte_chunk
)
{
return
_mm512_extracti64x2_epi64
(
src_64byte_chunk
,
0x0
);
}
static
CV_ALWAYS_INLINE
__m512i
convert_lower_8bytes_to_longs
(
__m128i
src_16bytes
)
{
__m512i
convert_lower_8bytes_to_longs
(
const
__m128i
src_16bytes
)
{
return
_mm512_cvtepu8_epi64
(
src_16bytes
);
}
static
CV_ALWAYS_INLINE
__m512i
square_m512
(
__m512i
src_longs
)
{
__m512i
square_m512
(
const
__m512i
src_longs
)
{
return
_mm512_mullo_epi64
(
src_longs
,
src_longs
);
}
static
CV_ALWAYS_INLINE
__m128i
shift_right_8_bytes
(
__m128i
src_16bytes
)
{
__m128i
shift_right_8_bytes
(
const
__m128i
src_16bytes
)
{
return
_mm_maskz_compress_epi64
(
2
,
src_16bytes
);
}
static
CV_ALWAYS_INLINE
__m512i
shift_right_16_bytes
(
__m512i
src_64byte_chunk
)
{
__m512i
shift_right_16_bytes
(
const
__m512i
src_64byte_chunk
)
{
return
_mm512_maskz_compress_epi64
(
0xFC
,
src_64byte_chunk
);
}
// Pairwise horizontal add of the eight 64-bit lanes of `a`.
//
// Mask 0xAA selects the odd lanes (1,3,5,7) and mask 0x55 the even lanes (0,2,4,6);
// each maskz_compress packs its four selected lanes contiguously into lanes 0..3 and
// zeroes lanes 4..7. Adding the two packed vectors therefore yields:
//   lane i (i = 0..3) : a[2*i] + a[2*i + 1]
//   lanes 4..7        : 0
static
CV_ALWAYS_INLINE
__m512i
m512_hadd
(
const
__m512i
a
){
return
_mm512_add_epi64
(
_mm512_maskz_compress_epi64
(
0xAA
,
a
),
_mm512_maskz_compress_epi64
(
0x55
,
a
));
}
// The calculate_integral function referenced here must be implemented in the templated derivatives
// because the algorithm depends heavily on the number of channels in the image
// This is the incomplete definition (just the prototype) here.
//
static
CV_ALWAYS_INLINE
__m512d
calculate_integral
(
const
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
);
};
...
...
@@ -246,7 +249,7 @@ public:
//
// The function prototype that needs to be implemented is:
//
// __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
// __m512d calculate_integral(
const
__m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
//
// Description of parameters:
// INPUTS:
...
...
@@ -265,12 +268,72 @@ public:
// Below here is the channel specific implementation
//
//========================================
// 1 Channel Integral Implementation
//========================================
// Single-channel specialization: running (inclusive prefix) sum across the eight 64-bit lanes
// of src_longs, continued from the previous call.
//
// INPUTS:
//   src_longs    - eight 64-bit integer values (lane 0 = A ... lane 7 = H in the diagrams below)
//   above_values - eight doubles added lane-wise to the result
//                  (presumably the integral values of the row above -- confirm against the caller)
// IN/OUT:
//   accumulator  - on entry, lane 7 is used as the carry-over from the previous call;
//                  on exit it holds carry + prefix sums of src_longs (lane 7 = carry + A..H)
// RETURNS:
//   accumulator converted (as unsigned 64-bit) to double, plus above_values
template
<>
CV_ALWAYS_INLINE
__m512d
IntegralCalculator
<
1
>
::
calculate_integral
(
const
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
{
// One-channel support is implemented differently from the 2, 3, and 4 channel cases:
// it needs more horizontal operations, which cannot be made vertical without losing performance.
// The logical operations needed look like:
// Vertical LANES : |7|6|5|4|3|2|1|0|
// src_longs : |H|G|F|E|D|C|B|A|
// shift_by_1 : + |G|F|E|D|C|B|A| |
// shift_by_2 : + |F|E|D|C|B|A| | |
// shift_by_3 : + |E|D|C|B|A| | | |
// shift_by_4 : + |D|C|B|A| | | | |
// shift_by_5 : + |C|B|A| | | | | |
// shift_by_6 : + |B|A| | | | | | |
// shift_by_7 : + |A| | | | | | | |
// carry_over_idxs : + |7|7|7|7|7|7|7|7| (index position of result from previous iteration)
// = integral
//
// If we do this vertically we end up losing performance because of the number of operations. We will instead
// build a horizontal add tree that produces the partial sums needed for each vertical lane
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// src_longs: | H | G | F | E | D | C | B | A |
// horiz_sum_1: | 0 | 0 | 0 | 0 | G+H | E+F | C+D | A+B |
// horiz_sum_2: | 0 | 0 | 0 | 0 | 0 | 0 | EFGH | ABCD |
//
const
__m512i
horiz_sum_1
=
m512_hadd
(
src_longs
);
// indexes for the permutes below (3,2,1,0) = (GH, EF, CD, AB); lanes 4..7 are zero
const
__m512i
horiz_sum_2
=
m512_hadd
(
horiz_sum_1
);
// indexes for the permutes below (9, 8) = (EFGH, ABCD)
// Then we can use the partial sums by looking at the vertical stacks above and realize that, for example
// ABCD appears vertically in lanes 7, 6, 5, 4, and 3 so we will permute the values so that all partial sums
// appear in the right lanes, and sum them up along with the carry over value from the accumulator. So we set up
// the lanes like:
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// s1 : | 0 | G | 0 | E | 0 | C | 0 | A |
// s2 : | ABCD | ABCD | ABCD | ABCD | ABCD | AB | AB | 0 |
// s3 : | EFGH | EF | EF | 0 | 0 | 0 | 0 | 0 |
// +------+------+------+------+------+------+------+------+
// sum : | A..H | A..G | A..F | A..E | A..D | A..C | A..B | A | Integral :-)
//
// Permute index encoding (_mm512_permutex2var_epi64): indexes 0..7 select a lane of the first
// operand (horiz_sum_1), indexes 8..15 select a lane of the last operand (horiz_sum_2).
// Index 4 is used to fetch a zero, since lanes 4..7 of horiz_sum_1 are always zero.
// Note that _mm512_set_epi64 lists lanes from 7 down to 0.
const
__m512i
s1
=
_mm512_maskz_mov_epi64
(
0x55
,
src_longs
);
// s1 = | 0 | G | 0 | E | 0 | C | 0 | A | (mask 0x55 keeps even lanes, zeroes odd lanes)
const
__m512i
s2
=
_mm512_permutex2var_epi64
(
horiz_sum_1
,
_mm512_set_epi64
(
8
,
8
,
8
,
8
,
8
,
0
,
0
,
4
),
horiz_sum_2
);
const
__m512i
s3
=
_mm512_permutex2var_epi64
(
horiz_sum_1
,
_mm512_set_epi64
(
9
,
2
,
2
,
4
,
4
,
4
,
4
,
4
),
horiz_sum_2
);
// Now we take the rolling sum from the previous iteration (lane 7 of accumulator) and replicate it
// into every lane of carry_over, then sum everything up into the accumulator
//
const
__m512i
carry_over
=
_mm512_permutex2var_epi64
(
accumulator
,
_mm512_set_epi64
(
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
),
accumulator
);
accumulator
=
_mm512_add_epi64
(
_mm512_add_epi64
(
s2
,
s3
),
_mm512_add_epi64
(
carry_over
,
s1
));
// Convert to double precision, add above_values, and return (the caller is responsible for storing)
//
__m512d
integral_pd
=
_mm512_add_pd
(
_mm512_cvtepu64_pd
(
accumulator
),
above_values
);
return
integral_pd
;
}
//========================================
// 2 Channel Integral Implementation
//========================================
template
<>
CV_ALWAYS_INLINE
__m512d
IntegralCalculator
<
2
>
::
calculate_integral
(
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
__m512d
IntegralCalculator
<
2
>
::
calculate_integral
(
const
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
{
__m512i
carryover_idxs
=
_mm512_set_epi64
(
7
,
6
,
7
,
6
,
7
,
6
,
7
,
6
);
...
...
@@ -300,12 +363,13 @@ __m512d IntegralCalculator < 2 > ::calculate_integral(__m512i src_longs, const _
return
integral_pd
;
}
//========================================
// 3 Channel Integral Implementation
//========================================
template
<>
CV_ALWAYS_INLINE
__m512d
IntegralCalculator
<
3
>
::
calculate_integral
(
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
__m512d
IntegralCalculator
<
3
>
::
calculate_integral
(
const
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
{
__m512i
carryover_idxs
=
_mm512_set_epi64
(
6
,
5
,
7
,
6
,
5
,
7
,
6
,
5
);
...
...
@@ -338,7 +402,7 @@ __m512d IntegralCalculator < 3 > ::calculate_integral(__m512i src_longs, const _
//========================================
template
<>
CV_ALWAYS_INLINE
__m512d
IntegralCalculator
<
4
>
::
calculate_integral
(
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
__m512d
IntegralCalculator
<
4
>
::
calculate_integral
(
const
__m512i
src_longs
,
const
__m512d
above_values
,
__m512i
&
accumulator
)
{
__m512i
carryover_idxs
=
_mm512_set_epi64
(
7
,
6
,
5
,
4
,
7
,
6
,
5
,
4
);
...
...
@@ -376,18 +440,23 @@ void calculate_integral_avx512(const uchar *src, size_t _srcstep,
int
width
,
int
height
,
int
cn
)
{
switch
(
cn
){
case
1
:
{
IntegralCalculator
<
1
>
calculator
;
calculator
.
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
);
break
;
}
case
2
:
{
IntegralCalculator
<
2
>
calculator
;
IntegralCalculator
<
2
>
calculator
;
calculator
.
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
);
break
;
}
case
3
:
{
IntegralCalculator
<
3
>
calculator
;
IntegralCalculator
<
3
>
calculator
;
calculator
.
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
);
break
;
}
case
4
:
{
IntegralCalculator
<
4
>
calculator
;
IntegralCalculator
<
4
>
calculator
;
calculator
.
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
);
}
}
...
...
modules/imgproc/src/sumpixels.cpp
View file @
90df5e00
...
...
@@ -77,7 +77,7 @@ struct Integral_SIMD<uchar, double, double> {
#if CV_TRY_AVX512_SKX
CV_UNUSED
(
_tiltedstep
);
// NOTE: inputs with 1 to 4 channels are dispatched to the AVX512 implementation (1-channel support is now implemented)
if
(
CV_CPU_HAS_SUPPORT_AVX512_SKX
&&
!
tilted
&&
(
(
cn
>=
2
)
&&
(
cn
<=
4
)
)){
if
(
CV_CPU_HAS_SUPPORT_AVX512_SKX
&&
!
tilted
&&
(
cn
<=
4
)){
opt_AVX512_SKX
::
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
,
cn
);
return
true
;
...
...
samples/opencl/opencl-opencv-interop.cpp
View file @
90df5e00
...
...
@@ -19,6 +19,7 @@
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS // eliminate build warning
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment