submodule / opencv / Commits / 7a1874b2

Commit 7a1874b2 authored Nov 12, 2012 by Vladislav Vinogradov
new reduce and reduceKeyVal implementation
parent d47c1124
Showing 8 changed files with 1149 additions and 15 deletions (+1149 -15)
reduce.hpp           modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp           +352  -0
reduce_key_val.hpp   modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp   +489  -0
reduce.hpp           modules/gpu/include/opencv2/gpu/device/reduce.hpp                  +197  -0
utility.hpp          modules/gpu/include/opencv2/gpu/device/utility.hpp                   +1  -1
vec_distance.hpp     modules/gpu/include/opencv2/gpu/device/vec_distance.hpp              +4  -4
warp_shuffle.hpp     modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp             +97  -0
orb.cu               modules/gpu/src/cuda/orb.cu                                          +7  -8
surf.cu              modules/gpu/src/cuda/surf.cu                                         +2  -2
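Before the per-file diffs, a quick sketch of how the new interface is meant to be called. The call shape matches the updated call sites in orb.cu and surf.cu (reduce<N>(smem, val, tid, op)); the kernel below is only an illustration, and the 256-thread block size, the kernel name and the plus<> functor from opencv2/gpu/device/functional.hpp are assumptions, not part of this commit.

#include <opencv2/gpu/device/reduce.hpp>      // new header added by this commit
#include <opencv2/gpu/device/functional.hpp>  // plus<> functor (assumed include)

using namespace cv::gpu::device;

// Illustrative kernel: one 256-thread block sums one row of src.
__global__ void rowSum(const float* src, float* dst, int cols)
{
    __shared__ float smem[256];

    float mySum = 0.0f;
    for (int x = threadIdx.x; x < cols; x += blockDim.x)
        mySum += src[blockIdx.x * cols + x];

    // Block-wide reduction; the dispatcher picks the shuffle or shared-memory path.
    reduce<256>(smem, mySum, threadIdx.x, plus<float>());

    if (threadIdx.x == 0)
        dst[blockIdx.x] = mySum;
}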
modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp  (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace reduce_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>          { typedef T type; };
        template <typename T> struct GetType<volatile T*> { typedef T type; };
        template <typename T> struct GetType<T&>          { typedef T type; };

        // Compile-time loop over the elements of a thrust::tuple.
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

        // Single-value helpers.
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }

        // Tuple overloads: apply the element-wise helpers to every member of the tuple.
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid, unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta, unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }

        // Shared-memory tree reduction for arbitrary block sizes.
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);

                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

        // Power-of-two blocks no larger than a warp: warp shuffles on sm_30+, shared memory otherwise.
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                (void) smem;
                (void) tid;

                #pragma unroll
                for (unsigned int i = N / 2; i >= 1; i /= 2)
                    mergeShfl(val, i, N, op);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                {
                    #pragma unroll
                    for (unsigned int i = N / 2; i >= 1; i /= 2)
                        merge(smem, val, tid, i, op);
                }
            #endif
            }
        };

        // Power-of-two blocks up to 1024 threads: reduce within each warp, then let the first warp combine the per-warp partials.
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                #pragma unroll
                for (int i = 16; i >= 1; i /= 2)
                    mergeShfl(val, i, warpSize, op);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                {
                    #pragma unroll
                    for (int i = 16; i >= 1; i /= 2)
                        merge(smem, val, tid, i, op);
                }

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    #pragma unroll
                    for (int i = M / 2; i >= 1; i /= 2)
                        mergeShfl(val, i, M, op);
                #else
                    #pragma unroll
                    for (int i = M / 2; i >= 1; i /= 2)
                        merge(smem, val, tid, i, op);
                #endif
                }
            }
        };

        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        // Pick the best reduction strategy for a given block size N.
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}
#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp  (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace reduce_key_val_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>          { typedef T type; };
        template <typename T> struct GetType<volatile T*> { typedef T type; };
        template <typename T> struct GetType<T&>          { typedef T type; };

        // Compile-time loop over the elements of a thrust::tuple.
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(data);

                For<I + 1, N>::loadToSmem(smem, data, tid);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(data) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, data, tid);
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
            {
                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

                For<I + 1, N>::copyShfl(val, delta, width);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
            {
                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

                For<I + 1, N>::copy(svals, val, tid, delta);
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
            {
                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(key) = reg;
                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
                }

                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
                                         const CmpTuple& cmp, unsigned int tid, unsigned int delta)
            {
                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
                }

                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
            {
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
            {
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                         const ValPointerTuple&, const ValReferenceTuple&,
                                         const CmpTuple&, unsigned int, unsigned int)
            {
            }
        };

        //////////////////////////////////////////////////////
        // loadToSmem

        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
        {
            smem[tid] = data;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
        {
            data = smem[tid];
        }

        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
        }

        //////////////////////////////////////////////////////
        // copyVals

        template <typename V>
        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
        {
            val = shfl_down(val, delta, width);
        }
        template <typename V>
        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
        {
            svals[tid] = val = svals[tid + delta];
        }
        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                     unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
        }

        //////////////////////////////////////////////////////
        // merge

        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename K,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                                  unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
        }
        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
        }

        //////////////////////////////////////////////////////
        // Generic

        // Shared-memory key/value reduction for arbitrary block sizes.
        template <unsigned int N> struct Generic
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(skeys, key, svals, val, cmp, tid, 1024);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(skeys, key, svals, val, cmp, tid, 512);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(skeys, key, svals, val, cmp, tid, 256);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(skeys, key, svals, val, cmp, tid, 128);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(skeys, key, svals, val, cmp, tid, 64);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(skeys, key, svals, val, cmp, tid, 32);
                }

                if (tid < 16)
                {
                    merge(skeys, key, svals, val, cmp, tid, 16);
                    merge(skeys, key, svals, val, cmp, tid, 8);
                    merge(skeys, key, svals, val, cmp, tid, 4);
                    merge(skeys, key, svals, val, cmp, tid, 2);
                    merge(skeys, key, svals, val, cmp, tid, 1);
                }
            }
        };

        template <unsigned int N> struct WarpOptimized
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
            #if __CUDA_ARCH__ >= 300
                (void) skeys;
                (void) svals;
                (void) tid;

                #pragma unroll
                for (unsigned int i = N / 2; i >= 1; i /= 2)
                    mergeShfl(key, val, cmp, i, N);
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (tid < N / 2)
                {
                    #pragma unroll
                    for (unsigned int i = N / 2; i >= 1; i /= 2)
                        merge(skeys, key, svals, val, cmp, tid, i);
                }
            #endif
            }
        };

        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                #pragma unroll
                for (unsigned int i = 16; i >= 1; i /= 2)
                    mergeShfl(key, val, cmp, i, warpSize);

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (laneId < 16)
                {
                    #pragma unroll
                    for (int i = 16; i >= 1; i /= 2)
                        merge(skeys, key, svals, val, cmp, tid, i);
                }

                __syncthreads();

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #endif

                __syncthreads();

                loadFromSmem(skeys, key, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    loadFromSmem(svals, val, tid);

                    #pragma unroll
                    for (unsigned int i = M / 2; i >= 1; i /= 2)
                        mergeShfl(key, val, cmp, i, M);
                #else
                    #pragma unroll
                    for (unsigned int i = M / 2; i >= 1; i /= 2)
                        merge(skeys, key, svals, val, cmp, tid, i);
                #endif
                }
            }
        };

        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
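A hedged sketch of what this key/value machinery is for, using the reduceKeyVal wrapper declared in the public reduce.hpp below: each thread contributes a (key, value) candidate and the block keeps the value whose key compares best. The 64-thread block size, the helper name blockArgMin, the buffer names and the less<> comparator from opencv2/gpu/device/functional.hpp are assumptions for illustration only.

#include <opencv2/gpu/device/reduce.hpp>
#include <opencv2/gpu/device/functional.hpp>  // less<> comparator (assumed include)

using namespace cv::gpu::device;

// Illustrative device helper: block-wide argmin over 64 threads.
// Each thread passes its own candidate; after the call, thread 0 holds
// the smallest bestDist together with the bestIdx that travelled with it.
__device__ void blockArgMin(float& bestDist, int& bestIdx)
{
    __shared__ float s_dist[64];
    __shared__ int   s_idx[64];

    // Keys are reordered with cmp; values are dragged along with their key.
    reduceKeyVal<64>(s_dist, bestDist, s_idx, bestIdx, threadIdx.x, less<float>());
}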
modules/gpu/include/opencv2/gpu/device/reduce.hpp  (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_REDUCE_HPP__
#define __OPENCV_GPU_REDUCE_HPP__
#include <thrust/tuple.h>
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
namespace cv { namespace gpu { namespace device
{
    template <int N, typename T, class Op>
    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
    {
        reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
    }
    template <int N,
              typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
              typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
              class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
    __device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                           unsigned int tid,
                                           const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
    {
        reduce_detail::Dispatcher<N>::reductor::template reduce<
            const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
            const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
            const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
    }

    template <unsigned int N, typename K, typename V, class Cmp>
    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
    }
    template <unsigned int N, typename K,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp>
    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, const Cmp& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
            volatile K*, K&,
            const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
            const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
            const Cmp&>(skeys, key, svals, val, tid, cmp);
    }
    template <unsigned int N,
              typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                                 const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid,
                                                 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
    {
        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
            const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
            const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
            const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
            const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
            const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
        >(skeys, key, svals, val, tid, cmp);
    }

    // smem_tuple
    template <typename T0>
    __device__ __forceinline__ thrust::tuple<volatile T0*>
    smem_tuple(T0* t0)
    {
        return thrust::make_tuple((volatile T0*) t0);
    }
    template <typename T0, typename T1>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*>
    smem_tuple(T0* t0, T1* t1)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
    }
    template <typename T0, typename T1, typename T2>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
    smem_tuple(T0* t0, T1* t1, T2* t2)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
    }
    template <typename T0, typename T1, typename T2, typename T3>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
    }
    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
    __device__ __forceinline__ thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
    {
        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
    }
}}}
#endif // __OPENCV_GPU_REDUCE_HPP__
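The tuple overloads above let a caller reduce several values in one pass over shared memory. A minimal sketch, assuming thrust::tie for the value references and two plus<> functors from functional.hpp; the 256-thread block size, the helper name and the variable names are illustrative only.

#include <thrust/tuple.h>
#include <opencv2/gpu/device/reduce.hpp>
#include <opencv2/gpu/device/functional.hpp>  // plus<> (assumed include)

using namespace cv::gpu::device;

// Illustrative fragment: sum two accumulators in a single fused reduction.
__device__ void reducePair(float& sumx, float& sumy, unsigned int tid)
{
    __shared__ float s_x[256];
    __shared__ float s_y[256];

    // smem_tuple casts the shared buffers to volatile pointers;
    // thrust::tie keeps references to the per-thread partial sums.
    reduce<256>(smem_tuple(s_x, s_y),
                thrust::tie(sumx, sumy),
                tid,
                thrust::make_tuple(plus<float>(), plus<float>()));
}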
modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace device
     ///////////////////////////////////////////////////////////////////////////////
     // Reduction
-    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
+    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce_old(volatile T* data, T& partial_reduction, int tid, const Op& op)
     {
         StaticAssert<n >= 8 && n <= 512>::check();
         utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
         template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
         {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
         }
         __device__ __forceinline__ operator int() const
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
         template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
         {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
         }
         __device__ __forceinline__ operator float() const
@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
         template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
         {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
         }
         __device__ __forceinline__ operator float() const
@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
         template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
         {
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
+            reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
         }
         __device__ __forceinline__ operator int() const
modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp  (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
    template <typename T>
    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl(val, srcLane, width);
    #else
        return T();
    #endif
    }
    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl(lo, srcLane, width);
        hi = __shfl(hi, srcLane, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }

    template <typename T>
    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl_down(val, delta, width);
    #else
        return T();
    #endif
    }
    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl_down(lo, delta, width);
        hi = __shfl_down(hi, delta, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
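A minimal sketch of how these primitives combine into a warp-wide sum (the helper name warpSum is made up): repeated shfl_down halving leaves the sum of all 32 lanes in lane 0 on sm_30 and newer. On older architectures shfl_down simply returns T(), which is why the reduce dispatchers above keep a shared-memory fallback behind the same __CUDA_ARCH__ check.

#include <opencv2/gpu/device/warp_shuffle.hpp>

// Illustrative helper: warp-wide sum via shfl_down (valid result in lane 0 on sm_30+).
__device__ __forceinline__ float warpSum(float v)
{
#if __CUDA_ARCH__ >= 300
    #pragma unroll
    for (int delta = 16; delta >= 1; delta /= 2)
        v += cv::gpu::device::shfl_down(v, delta);
#endif
    return v;
}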
modules/gpu/src/cuda/orb.cu
@@ -109,9 +109,9 @@ namespace cv { namespace gpu { namespace device
             c += Ix * Iy;
         }
-        reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
-        reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
-        reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+        reduce_old<32>(srow, a, threadIdx.x, plus<volatile int>());
+        reduce_old<32>(srow, b, threadIdx.x, plus<volatile int>());
+        reduce_old<32>(srow, c, threadIdx.x, plus<volatile int>());
         if (threadIdx.x == 0)
         {
@@ -167,7 +167,7 @@ namespace cv { namespace gpu { namespace device
             for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                 m_10 += u * image(loc.y, loc.x + u);
-            reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+            reduce_old<32>(srow, m_10, threadIdx.x, plus<volatile int>());
             for (int v = 1; v <= half_k; ++v)
             {
@@ -185,8 +185,8 @@ namespace cv { namespace gpu { namespace device
                 m_sum += u * (val_plus + val_minus);
             }
-            reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
-            reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+            reduce_old<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
+            reduce_old<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
             m_10 += m_sum;
             m_01 += v * v_sum;
@@ -419,4 +419,4 @@ namespace cv { namespace gpu { namespace device
     }
 }}}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
modules/gpu/src/cuda/surf.cu
@@ -599,8 +599,8 @@ namespace cv { namespace gpu { namespace device
                 sumy += s_Y[threadIdx.x + 96];
             }
-            device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
-            device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
+            device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
+            device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
             const float temp_mod = sumx * sumx + sumy * sumy;
             if (temp_mod > best_mod)