Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
e7502e76
Commit
e7502e76
authored
Oct 19, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
implemented gpu::gemm via CUBLAS
parent
90ff3dd9
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
259 additions
and
2 deletions
+259
-2
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+4
-0
perf_arithm.cpp
modules/gpu/perf/perf_arithm.cpp
+31
-0
arithm.cpp
modules/gpu/src/arithm.cpp
+128
-0
element_operations.cpp
modules/gpu/src/element_operations.cpp
+2
-2
test_arithm.cpp
modules/gpu/test/test_arithm.cpp
+64
-0
tests.cpp
samples/gpu/performance/tests.cpp
+30
-0
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
e7502e76
...
...
@@ -482,6 +482,10 @@ namespace cv
////////////////////////////// Arithmetics ///////////////////////////////////
//! implements generalized matrix product algorithm GEMM from BLAS
CV_EXPORTS
void
gemm
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
double
alpha
,
const
GpuMat
&
src3
,
double
beta
,
GpuMat
&
dst
,
int
flags
=
0
,
Stream
&
stream
=
Stream
::
Null
());
//! transposes the matrix
//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
CV_EXPORTS
void
transpose
(
const
GpuMat
&
src1
,
GpuMat
&
dst
,
Stream
&
stream
=
Stream
::
Null
());
...
...
modules/gpu/perf/perf_arithm.cpp
View file @
e7502e76
...
...
@@ -747,3 +747,34 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
SANITY_CHECK
(
dst_host
);
}
PERF_TEST_P
(
DevInfo_Size
,
gemm
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
cv
::
Size
(
512
,
512
),
cv
::
Size
(
1024
,
1024
),
cv
::
Size
(
2048
,
2048
),
cv
::
Size
(
4096
,
4096
))))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
Size
size
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
setDevice
(
devInfo
.
deviceID
());
Mat
src1_host
(
size
,
CV_32FC1
);
Mat
src2_host
(
size
,
CV_32FC1
);
Mat
src3_host
(
size
,
CV_32FC1
);
declare
.
in
(
src1_host
,
src2_host
,
src3_host
,
WARMUP_RNG
);
GpuMat
src1
(
src1_host
);
GpuMat
src2
(
src2_host
);
GpuMat
src3
(
src3_host
);
GpuMat
dst
(
size
,
CV_32FC1
);
declare
.
time
(
5.0
);
SIMPLE_TEST_CYCLE
()
{
gemm
(
src1
,
src2
,
1.0
,
src3
,
1.0
,
dst
);
}
Mat
dst_host
=
dst
;
SANITY_CHECK
(
dst_host
);
}
modules/gpu/src/arithm.cpp
View file @
e7502e76
...
...
@@ -48,6 +48,7 @@ using namespace std;
#if !defined (HAVE_CUDA)
void
cv
::
gpu
::
gemm
(
const
GpuMat
&
,
const
GpuMat
&
,
double
,
const
GpuMat
&
,
double
,
GpuMat
&
,
int
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
transpose
(
const
GpuMat
&
,
GpuMat
&
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
flip
(
const
GpuMat
&
,
GpuMat
&
,
int
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
LUT
(
const
GpuMat
&
,
const
Mat
&
,
GpuMat
&
,
Stream
&
)
{
throw_nogpu
();
}
...
...
@@ -63,6 +64,133 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
#else
/* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// gemm
void
cv
::
gpu
::
gemm
(
const
GpuMat
&
src1
,
const
GpuMat
&
src2
,
double
alpha
,
const
GpuMat
&
src3
,
double
beta
,
GpuMat
&
dst
,
int
flags
,
Stream
&
stream
)
{
#ifndef HAVE_CUBLAS
OPENCV_GPU_UNUSED
(
src1
);
OPENCV_GPU_UNUSED
(
src2
);
OPENCV_GPU_UNUSED
(
alpha
);
OPENCV_GPU_UNUSED
(
src3
);
OPENCV_GPU_UNUSED
(
beta
);
OPENCV_GPU_UNUSED
(
dst
);
OPENCV_GPU_UNUSED
(
flags
);
OPENCV_GPU_UNUSED
(
stream
);
throw_nogpu
();
#else
// CUBLAS works with column-major matrices
CV_Assert
(
src1
.
type
()
==
CV_32FC1
||
src1
.
type
()
==
CV_32FC2
||
src1
.
type
()
==
CV_64FC1
||
src1
.
type
()
==
CV_64FC2
);
CV_Assert
(
src2
.
type
()
==
src1
.
type
()
&&
(
src3
.
empty
()
||
src3
.
type
()
==
src1
.
type
()));
bool
tr1
=
flags
&
GEMM_1_T
;
bool
tr2
=
flags
&
GEMM_2_T
;
bool
tr3
=
flags
&
GEMM_3_T
;
Size
src1Size
=
tr1
?
Size
(
src1
.
rows
,
src1
.
cols
)
:
src1
.
size
();
Size
src2Size
=
tr2
?
Size
(
src2
.
rows
,
src2
.
cols
)
:
src2
.
size
();
Size
src3Size
=
tr3
?
Size
(
src3
.
rows
,
src3
.
cols
)
:
src3
.
size
();
Size
dstSize
(
src2Size
.
width
,
src1Size
.
height
);
CV_Assert
(
src1Size
.
width
==
src2Size
.
height
);
CV_Assert
(
src3
.
empty
()
||
src3Size
==
dstSize
);
dst
.
create
(
dstSize
,
CV_32FC1
);
if
(
beta
!=
0
)
{
if
(
src3
.
empty
())
{
if
(
stream
)
stream
.
enqueueMemSet
(
dst
,
Scalar
::
all
(
0
));
else
dst
.
setTo
(
Scalar
::
all
(
0
));
}
else
{
if
(
tr3
)
{
transpose
(
src3
,
dst
,
stream
);
}
else
{
if
(
stream
)
stream
.
enqueueCopy
(
src3
,
dst
);
else
src3
.
copyTo
(
dst
);
}
}
}
cublasHandle_t
handle
;
cublasSafeCall
(
cublasCreate_v2
(
&
handle
)
);
cublasSafeCall
(
cublasSetStream_v2
(
handle
,
StreamAccessor
::
getStream
(
stream
))
);
cublasSafeCall
(
cublasSetPointerMode_v2
(
handle
,
CUBLAS_POINTER_MODE_HOST
)
);
const
float
alphaf
=
static_cast
<
float
>
(
alpha
);
const
float
betaf
=
static_cast
<
float
>
(
beta
);
const
cuComplex
alphacf
=
make_cuComplex
(
alphaf
,
0
);
const
cuComplex
betacf
=
make_cuComplex
(
betaf
,
0
);
const
cuDoubleComplex
alphac
=
make_cuDoubleComplex
(
alpha
,
0
);
const
cuDoubleComplex
betac
=
make_cuDoubleComplex
(
beta
,
0
);
cublasOperation_t
transa
=
tr2
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
transb
=
tr1
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
switch
(
src1
.
type
())
{
case
CV_32FC1
:
cublasSafeCall
(
cublasSgemm_v2
(
handle
,
transa
,
transb
,
tr2
?
src2
.
rows
:
src2
.
cols
,
tr1
?
src1
.
cols
:
src1
.
rows
,
tr2
?
src2
.
cols
:
src2
.
rows
,
&
alphaf
,
src2
.
ptr
<
float
>
(),
static_cast
<
int
>
(
src2
.
step
/
sizeof
(
float
)),
src1
.
ptr
<
float
>
(),
static_cast
<
int
>
(
src1
.
step
/
sizeof
(
float
)),
&
betaf
,
dst
.
ptr
<
float
>
(),
static_cast
<
int
>
(
dst
.
step
/
sizeof
(
float
)))
);
break
;
case
CV_64FC1
:
cublasSafeCall
(
cublasDgemm_v2
(
handle
,
transa
,
transb
,
tr2
?
src2
.
rows
:
src2
.
cols
,
tr1
?
src1
.
cols
:
src1
.
rows
,
tr2
?
src2
.
cols
:
src2
.
rows
,
&
alpha
,
src2
.
ptr
<
double
>
(),
static_cast
<
int
>
(
src2
.
step
/
sizeof
(
double
)),
src1
.
ptr
<
double
>
(),
static_cast
<
int
>
(
src1
.
step
/
sizeof
(
double
)),
&
beta
,
dst
.
ptr
<
double
>
(),
static_cast
<
int
>
(
dst
.
step
/
sizeof
(
double
)))
);
break
;
case
CV_32FC2
:
cublasSafeCall
(
cublasCgemm_v2
(
handle
,
transa
,
transb
,
tr2
?
src2
.
rows
:
src2
.
cols
,
tr1
?
src1
.
cols
:
src1
.
rows
,
tr2
?
src2
.
cols
:
src2
.
rows
,
&
alphacf
,
src2
.
ptr
<
cuComplex
>
(),
static_cast
<
int
>
(
src2
.
step
/
sizeof
(
cuComplex
)),
src1
.
ptr
<
cuComplex
>
(),
static_cast
<
int
>
(
src1
.
step
/
sizeof
(
cuComplex
)),
&
betacf
,
dst
.
ptr
<
cuComplex
>
(),
static_cast
<
int
>
(
dst
.
step
/
sizeof
(
cuComplex
)))
);
break
;
case
CV_64FC2
:
cublasSafeCall
(
cublasZgemm_v2
(
handle
,
transa
,
transb
,
tr2
?
src2
.
rows
:
src2
.
cols
,
tr1
?
src1
.
cols
:
src1
.
rows
,
tr2
?
src2
.
cols
:
src2
.
rows
,
&
alphac
,
src2
.
ptr
<
cuDoubleComplex
>
(),
static_cast
<
int
>
(
src2
.
step
/
sizeof
(
cuDoubleComplex
)),
src1
.
ptr
<
cuDoubleComplex
>
(),
static_cast
<
int
>
(
src1
.
step
/
sizeof
(
cuDoubleComplex
)),
&
betac
,
dst
.
ptr
<
cuDoubleComplex
>
(),
static_cast
<
int
>
(
dst
.
step
/
sizeof
(
cuDoubleComplex
)))
);
break
;
}
cublasSafeCall
(
cublasDestroy_v2
(
handle
)
);
#endif
}
////////////////////////////////////////////////////////////////////////
// transpose
...
...
modules/gpu/src/element_operations.cpp
View file @
e7502e76
...
...
@@ -434,7 +434,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
{
0
/*multiply_gpu<double, unsigned char>*/
,
0
/*multiply_gpu<double, signed char>*/
,
0
/*multiply_gpu<double, unsigned short>*/
,
0
/*multiply_gpu<double, short>*/
,
0
/*multiply_gpu<double, int>*/
,
0
/*multiply_gpu<double, float>*/
,
multiply_gpu
<
double
,
double
>
}
};
CV_Assert
(
src
.
channels
()
==
1
);
//
CV_Assert(src.channels() == 1);
if
(
dtype
<
0
)
dtype
=
src
.
depth
();
...
...
@@ -463,7 +463,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
const
func_t
func
=
funcs
[
src
.
depth
()][
dst
.
depth
()];
CV_Assert
(
func
!=
0
);
func
(
src
,
sc
.
val
[
0
],
dst
,
scale
,
stream
);
func
(
src
.
reshape
(
1
),
sc
.
val
[
0
],
dst
.
reshape
(
1
)
,
scale
,
stream
);
}
////////////////////////////////////////////////////////////////////////
...
...
modules/gpu/test/test_arithm.cpp
View file @
e7502e76
...
...
@@ -1860,4 +1860,68 @@ INSTANTIATE_TEST_CASE_P(Arithm, Reduce, testing::Combine(
testing
::
Values
(
0
,
1
),
testing
::
Values
((
int
)
CV_REDUCE_SUM
,
(
int
)
CV_REDUCE_AVG
,
(
int
)
CV_REDUCE_MAX
,
(
int
)
CV_REDUCE_MIN
)));
//////////////////////////////////////////////////////////////////////////////
// gemm
struct
GEMM
:
testing
::
TestWithParam
<
std
::
tr1
::
tuple
<
cv
::
gpu
::
DeviceInfo
,
int
,
int
>
>
{
cv
::
gpu
::
DeviceInfo
devInfo
;
int
type
;
int
flags
;
int
size
;
cv
::
Mat
src1
;
cv
::
Mat
src2
;
cv
::
Mat
src3
;
double
alpha
;
double
beta
;
cv
::
Mat
dst_gold
;
virtual
void
SetUp
()
{
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
type
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
flags
=
std
::
tr1
::
get
<
2
>
(
GetParam
());
cv
::
gpu
::
setDevice
(
devInfo
.
deviceID
());
cv
::
RNG
&
rng
=
cvtest
::
TS
::
ptr
()
->
get_rng
();
size
=
rng
.
uniform
(
100
,
500
);
src1
=
cvtest
::
randomMat
(
rng
,
cv
::
Size
(
size
,
size
),
type
,
-
10.0
,
10.0
,
false
);
src2
=
cvtest
::
randomMat
(
rng
,
cv
::
Size
(
size
,
size
),
type
,
-
10.0
,
10.0
,
false
);
src3
=
cvtest
::
randomMat
(
rng
,
cv
::
Size
(
size
,
size
),
type
,
-
10.0
,
10.0
,
false
);
alpha
=
rng
.
uniform
(
-
10.0
,
10.0
);
beta
=
rng
.
uniform
(
-
10.0
,
10.0
);
cv
::
gemm
(
src1
,
src2
,
alpha
,
src3
,
beta
,
dst_gold
,
flags
);
}
};
TEST_P
(
GEMM
,
Accuracy
)
{
PRINT_PARAM
(
devInfo
);
PRINT_TYPE
(
type
);
PRINT_PARAM
(
flags
);
cv
::
Mat
dst
;
ASSERT_NO_THROW
(
cv
::
gpu
::
GpuMat
dev_dst
;
cv
::
gpu
::
gemm
(
cv
::
gpu
::
GpuMat
(
src1
),
cv
::
gpu
::
GpuMat
(
src2
),
alpha
,
cv
::
gpu
::
GpuMat
(
src3
),
beta
,
dev_dst
,
flags
);
dev_dst
.
download
(
dst
);
);
EXPECT_MAT_NEAR
(
dst_gold
,
dst
,
1e-1
);
}
INSTANTIATE_TEST_CASE_P
(
Arithm
,
GEMM
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
CV_32FC1
,
CV_32FC2
),
testing
::
Values
(
0
,
(
int
)
cv
::
GEMM_1_T
,
(
int
)
cv
::
GEMM_2_T
,
(
int
)
cv
::
GEMM_3_T
)));
#endif // HAVE_CUDA
samples/gpu/performance/tests.cpp
View file @
e7502e76
...
...
@@ -1471,3 +1471,33 @@ TEST(reduce)
GPU_OFF
;
}
}
TEST
(
gemm
)
{
Mat
src1
,
src2
,
src3
,
dst
;
gpu
::
GpuMat
d_src1
,
d_src2
,
d_src3
,
d_dst
;
for
(
int
size
=
512
;
size
<=
2048
;
size
*=
2
)
{
SUBTEST
<<
"size "
<<
size
<<
", 32FC1"
;
gen
(
src1
,
size
,
size
,
CV_32FC1
,
Scalar
::
all
(
-
10
),
Scalar
::
all
(
10
));
gen
(
src2
,
size
,
size
,
CV_32FC1
,
Scalar
::
all
(
-
10
),
Scalar
::
all
(
10
));
gen
(
src3
,
size
,
size
,
CV_32FC1
,
Scalar
::
all
(
-
10
),
Scalar
::
all
(
10
));
dst
.
create
(
src1
.
size
(),
src1
.
type
());
CPU_ON
;
gemm
(
src1
,
src2
,
1.0
,
src3
,
1.0
,
dst
);
CPU_OFF
;
d_src1
=
src1
;
d_src2
=
src2
;
d_src3
=
src3
;
d_dst
.
create
(
d_src1
.
size
(),
d_src1
.
type
());
GPU_ON
;
gpu
::
gemm
(
d_src1
,
d_src2
,
1.0
,
d_src3
,
1.0
,
d_dst
);
GPU_OFF
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment