Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0318d277
Commit
0318d277
authored
Jul 10, 2014
by
Alexander Karsakov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Enabled precalculated wave
parent
5dd92638
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
184 additions
and
183 deletions
+184
-183
cvdef.h
modules/core/include/opencv2/core/cvdef.h
+1
-0
perf_dxt.cpp
modules/core/perf/opencl/perf_dxt.cpp
+2
-2
dxt.cpp
modules/core/src/dxt.cpp
+77
-50
fft.cl
modules/core/src/opencl/fft.cl
+100
-127
test_dft.cpp
modules/core/test/ocl/test_dft.cpp
+4
-4
No files found.
modules/core/include/opencv2/core/cvdef.h
View file @
0318d277
...
...
@@ -244,6 +244,7 @@ typedef signed char schar;
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_TWO_PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
/****************************************************************************************\
...
...
modules/core/perf/opencl/perf_dxt.cpp
View file @
0318d277
...
...
@@ -57,8 +57,8 @@ namespace ocl {
typedef
tuple
<
Size
,
int
>
DftParams
;
typedef
TestBaseWithParam
<
DftParams
>
DftFixture
;
OCL_PERF_TEST_P
(
DftFixture
,
Dft
,
::
testing
::
Combine
(
Values
(
/*OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, */
Size
(
1024
,
1024
),
Size
(
1024
,
2048
),
Size
(
512
,
512
),
Size
(
2048
,
2048
)),
Values
((
int
)
DFT_ROWS
/*
, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE,
OCL_PERF_TEST_P
(
DftFixture
,
Dft
,
::
testing
::
Combine
(
Values
(
OCL_SIZE_1
,
OCL_SIZE_2
,
OCL_SIZE_3
,
Size
(
1024
,
1024
),
Size
(
1024
,
2048
),
Size
(
512
,
512
),
Size
(
2048
,
2048
)),
Values
((
int
)
DFT_ROWS
,
(
int
)
0
/*, (int)DFT_SCALE, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/
)))
{
const
DftParams
params
=
GetParam
();
...
...
modules/core/src/dxt.cpp
View file @
0318d277
...
...
@@ -2034,26 +2034,6 @@ namespace cv
#ifdef HAVE_OPENCL
static
bool
fft_radixN
(
InputArray
_src
,
OutputArray
_dst
,
int
radix
,
int
block_size
,
int
nonzero_rows
,
int
flags
)
{
int
N
=
_src
.
size
().
width
;
if
(
N
%
radix
)
return
false
;
UMat
src
=
_src
.
getUMat
();
UMat
dst
=
_dst
.
getUMat
();
int
thread_count
=
N
/
radix
;
size_t
globalsize
[
2
]
=
{
thread_count
,
nonzero_rows
};
String
kernel_name
=
format
(
"fft_radix%d"
,
radix
);
ocl
::
Kernel
k
(
kernel_name
.
c_str
(),
ocl
::
core
::
fft_oclsrc
,
(
flags
&
DFT_INVERSE
)
!=
0
?
"-D INVERSE"
:
""
);
if
(
k
.
empty
())
return
false
;
k
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
dst
),
block_size
,
thread_count
,
nonzero_rows
);
return
k
.
run
(
2
,
globalsize
,
NULL
,
false
);
}
static
bool
ocl_packToCCS
(
InputArray
_buffer
,
OutputArray
_dst
,
int
flags
)
{
UMat
buffer
=
_buffer
.
getUMat
();
...
...
@@ -2098,24 +2078,18 @@ static bool ocl_packToCCS(InputArray _buffer, OutputArray _dst, int flags)
return
true
;
}
static
bool
ocl_dft_C2C_row
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_rows
,
int
flags
)
static
std
::
vector
<
int
>
ocl_getRadixes
(
int
cols
,
int
&
min_radix
)
{
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
),
channels
=
CV_MAT_CN
(
type
);
UMat
src
=
_src
.
getUMat
();
bool
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
;
if
(
depth
==
CV_64F
&&
!
doubleSupport
)
return
false
;
int
factors
[
34
];
int
nf
=
DFTFactorize
(
src
.
cols
,
factors
);
int
nf
=
DFTFactorize
(
cols
,
factors
);
int
n
=
1
;
int
factor_index
=
0
;
String
radix_processing
;
int
min_radix
=
INT_MAX
;
// 1. 2^n transforms
// choose radix order
std
::
vector
<
int
>
radixes
;
// 2^n transforms
if
(
(
factors
[
factor_index
]
&
1
)
==
0
)
{
for
(
;
n
<
factors
[
factor_index
];
)
...
...
@@ -2126,24 +2100,76 @@ static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows,
else
if
(
4
*
n
<=
factors
[
0
])
radix
=
4
;
radix
_processing
+=
format
(
"fft_radix%d(smem,x,%d,%d);"
,
radix
,
n
,
src
.
cols
/
radix
);
min_radix
=
min
(
radix
,
min_
radix
);
radix
es
.
push_back
(
radix
);
min_radix
=
min
(
min_radix
,
radix
);
n
*=
radix
;
}
factor_index
++
;
}
//
2.
all the other transforms
// all the other transforms
for
(
;
factor_index
<
nf
;
factor_index
++
)
{
int
radix
=
factors
[
factor_index
];
radix_processing
+=
format
(
"fft_radix%d(smem,x,%d,%d);"
,
radix
,
n
,
src
.
cols
/
radix
);
min_radix
=
min
(
radix
,
min_radix
);
radixes
.
push_back
(
factors
[
factor_index
]);
min_radix
=
min
(
min_radix
,
factors
[
factor_index
]);
}
return
radixes
;
}
static
bool
ocl_dft_C2C_row
(
InputArray
_src
,
OutputArray
_dst
,
InputOutputArray
_twiddles
,
int
nonzero_rows
,
int
flags
)
{
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
),
channels
=
CV_MAT_CN
(
type
);
UMat
src
=
_src
.
getUMat
();
bool
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
;
if
(
depth
==
CV_64F
&&
!
doubleSupport
)
return
false
;
int
min_radix
=
INT_MAX
;
std
::
vector
<
int
>
radixes
=
ocl_getRadixes
(
src
.
cols
,
min_radix
);
// generate string with radix calls
String
radix_processing
;
int
n
=
1
,
twiddle_index
=
0
;
for
(
size_t
i
=
0
;
i
<
radixes
.
size
();
i
++
)
{
int
radix
=
radixes
[
i
];
radix_processing
+=
format
(
"fft_radix%d(smem,twiddles+%d,x,%d,%d);"
,
radix
,
twiddle_index
,
n
,
src
.
cols
/
radix
);
twiddle_index
+=
(
radix
-
1
)
*
n
;
n
*=
radix
;
}
UMat
dst
=
_dst
.
getUMat
();
UMat
twiddles
=
_twiddles
.
getUMat
();
if
(
twiddles
.
cols
!=
twiddle_index
)
{
// need to create/update tweedle table
int
buffer_size
=
twiddle_index
;
twiddles
.
create
(
1
,
buffer_size
,
CV_32FC2
);
Mat
tw
=
twiddles
.
getMat
(
ACCESS_WRITE
);
float
*
ptr
=
tw
.
ptr
<
float
>
();
int
ptr_index
=
0
;
int
n
=
1
;
for
(
size_t
i
=
0
;
i
<
radixes
.
size
();
i
++
)
{
int
radix
=
radixes
[
i
];
n
*=
radix
;
for
(
int
k
=
0
;
k
<
(
n
/
radix
);
k
++
)
{
double
theta
=
-
CV_TWO_PI
*
k
/
n
;
for
(
int
j
=
1
;
j
<
radix
;
j
++
)
{
ptr
[
ptr_index
++
]
=
cos
(
j
*
theta
);
ptr
[
ptr_index
++
]
=
sin
(
j
*
theta
);
}
}
}
}
//Mat buf = twiddles.getMat(ACCESS_READ);
UMat
dst
=
_dst
.
getUMat
();
int
thread_count
=
src
.
cols
/
min_radix
;
size_t
globalsize
[
2
]
=
{
thread_count
,
nonzero_rows
};
size_t
localsize
[
2
]
=
{
thread_count
,
1
};
...
...
@@ -2154,7 +2180,7 @@ static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows,
if
(
k
.
empty
())
return
false
;
k
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
dst
),
thread_count
,
nonzero_rows
);
k
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
dst
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
twiddles
),
thread_count
,
nonzero_rows
);
return
k
.
run
(
2
,
globalsize
,
localsize
,
false
);
}
...
...
@@ -2232,25 +2258,26 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
if
(
nonzero_rows
<=
0
||
nonzero_rows
>
_src
.
rows
()
)
nonzero_rows
=
_src
.
rows
();
UMat
buffer
;
if
(
!
ocl_dft_C2C_row
(
src
,
dst
,
nonzero_rows
,
flags
))
if
(
!
ocl_dft_C2C_row
(
src
,
dst
,
buffer
,
nonzero_rows
,
flags
))
return
false
;
if
((
flags
&
DFT_ROWS
)
==
0
&&
nonzero_rows
>
1
)
{
transpose
(
dst
,
dst
);
if
(
!
ocl_dft_C2C_row
(
dst
,
dst
,
dst
.
rows
,
flags
))
if
(
!
ocl_dft_C2C_row
(
dst
,
dst
,
buffer
,
dst
.
rows
,
flags
))
return
false
;
transpose
(
dst
,
dst
);
}
//
if (complex_output)
//
{
//
if (real_input && is1d)
// _dst.assign(buffer.colRange(0, buffer
.cols/2+1));
//
else
// _dst.assign(buffer
);
//
}
if
(
complex_output
)
{
if
(
real_input
&&
is1d
)
_dst
.
assign
(
dst
.
colRange
(
0
,
dst
.
cols
/
2
+
1
));
else
_dst
.
assign
(
dst
);
}
//else
//{
// if (!inv)
...
...
modules/core/src/opencl/fft.cl
View file @
0318d277
...
...
@@ -7,55 +7,36 @@ __constant float fft5_3 = -0.951056516295f;
__constant
float
fft5_4
=
-1.538841768587f
;
__constant
float
fft5_5
=
0.363271264002f
;
inline
float2
mul_float2
(
float2
a,
float2
b
)
{
__attribute__
((
always_inline
))
float2
mul_float2
(
float2
a,
float2
b
)
{
float2
res
;
res.x
=
a.x
*
b.x
-
a.y
*
b.y
;
res.y
=
a.x
*
b.y
+
a.y
*
b.x
;
return
res
;
}
inline
float2
sincos_float2
(
float
alpha
)
{
__attribute__
((
always_inline
))
float2
sincos_float2
(
float
alpha
)
{
float
cs,
sn
;
sn
=
sincos
(
alpha,
&cs
)
; // sincos
return
(
float2
)(
cs,
sn
)
;
}
inline
float2
twiddle
(
float2
a
)
{
__attribute__
((
always_inline
))
float2
twiddle
(
float2
a
)
{
return
(
float2
)(
a.y,
-a.x
)
;
}
inline
float2
square
(
float2
a
)
{
return
(
float2
)(
a.x
*
a.x
-
a.y
*
a.y,
2.0f
*
a.x
*
a.y
)
;
}
inline
float2
square3
(
float2
a
)
{
return
(
float2
)(
a.x
*
a.x
-
a.y
*
a.y,
3.0f
*
a.x
*
a.y
)
;
}
inline
float2
mul_p1q4
(
float2
a
)
{
return
(
float2
)(
SQRT_2
)
*
(
float2
)(
a.x
+
a.y,
-a.x
+
a.y
)
;
}
inline
float2
mul_p3q4
(
float2
a
)
{
return
(
float2
)(
SQRT_2
)
*
(
float2
)(
-a.x
+
a.y,
-a.x
-
a.y
)
;
}
__attribute__
((
always_inline
))
void
fft_radix2
(
__local
float2*
smem,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix2
(
__local
float2*
smem,
__global
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
float2
in1,
temp
;
float2
a0,
a1
;
if
(
x
<
t
)
{
in1
=
smem[x]
;
float2
in2
=
smem[x+t]
;
float
theta
=
-PI
*
k
/
block_size
;
float
cs
;
float
sn
=
sincos
(
theta,
&cs
)
;
temp
=
(
float2
)
(
in2.x
*
cs
-
in2.y
*
sn,
in2.y
*
cs
+
in2.x
*
sn
)
;
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],smem[x+t]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -64,36 +45,25 @@ void fft_radix2(__local float2* smem, const int x, const int block_size, const i
{
const
int
dst_ind
=
(
x
<<
1
)
-
k
;
smem[dst_ind]
=
in1
+
temp
;
smem[dst_ind+block_size]
=
in1
-
temp
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind+block_size]
=
a0
-
a1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix4
(
__local
float2*
smem,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix4
(
__local
float2*
smem,
__global
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
float2
b0,
b1,
b2,
b
3
;
float2
a0,
a1,
a2,
a
3
;
if
(
x
<
t
)
{
float
theta
=
-PI
*
k
/
(
2
*
block_size
)
;
float2
tw
=
sincos_float2
(
theta
)
;
float2
a0
=
smem[x]
;
float2
a1
=
mul_float2
(
tw,
smem[x+t]
)
;
float2
a2
=
smem[x
+
2*t]
;
float2
a3
=
mul_float2
(
tw,
smem[x
+
3*t]
)
;
tw
=
square
(
tw
)
;
a2
=
mul_float2
(
tw,
a2
)
;
a3
=
mul_float2
(
tw,
a3
)
;
b0
=
a0
+
a2
;
b1
=
a0
-
a2
;
b2
=
a1
+
a3
;
b3
=
twiddle
(
a1
-
a3
)
;
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[3*k],smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[3*k
+
1],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[3*k
+
2],smem[x+3*t]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -101,63 +71,62 @@ void fft_radix4(__local float2* smem, const int x, const int block_size, const i
if
(
x
<
t
)
{
const
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
smem[dst_ind]
=
b0
+
b2
;
smem[dst_ind
+
block_size]
=
b1
+
b3
;
smem[dst_ind
+
2*block_size]
=
b0
-
b2
;
smem[dst_ind
+
3*block_size]
=
b1
-
b3
;
float2
b0
=
a0
+
a2
;
a2
=
a0
-
a2
;
float2
b1
=
a1
+
a3
;
a3
=
twiddle
(
a1
-
a3
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind
+
block_size]
=
a2
+
a3
;
smem[dst_ind
+
2*block_size]
=
b0
-
b1
;
smem[dst_ind
+
3*block_size]
=
a2
-
a3
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix8
(
__local
float2*
smem,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix8
(
__local
float2*
smem,
__global
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x
<
t
)
{
float
theta
=
-PI
*
k
/
(
4
*
block_size
)
;
int
tw_ind
=
block_size
/
8
;
float2
tw
=
sincos_float2
(
theta
)
; // W
a0
=
smem[x]
;
a1
=
mul_float2
(
tw,
smem[x
+
t]
)
;
a2
=
smem[x
+
2
*
t]
;
a3
=
mul_float2
(
tw,
smem[x
+
3
*
t]
)
;
a4
=
smem[x
+
4
*
t]
;
a5
=
mul_float2
(
tw,
smem[x
+
5
*
t]
)
;
a6
=
smem[x
+
6
*
t]
;
a7
=
mul_float2
(
tw,
smem[x
+
7
*
t]
)
;
tw
=
square
(
tw
)
; // W^2
a2
=
mul_float2
(
tw,
a2
)
;
a3
=
mul_float2
(
tw,
a3
)
;
a6
=
mul_float2
(
tw,
a6
)
;
a7
=
mul_float2
(
tw,
a7
)
;
tw
=
square
(
tw
)
; // W^4
a4
=
mul_float2
(
tw,
a4
)
;
a5
=
mul_float2
(
tw,
a5
)
;
a6
=
mul_float2
(
tw,
a6
)
;
a7
=
mul_float2
(
tw,
a7
)
;
float2
b0
=
a0
+
a4
;
float2
b4
=
a0
-
a4
;
float2
b1
=
a1
+
a5
;
float2
b5
=
mul_p1q4
(
a1
-
a5
)
;
float2
b2
=
a2
+
a6
;
float2
b6
=
twiddle
(
a2
-
a6
)
;
float2
b3
=
a3
+
a7
;
float2
b7
=
mul_p3q4
(
a3
-
a7
)
;
a0
=
b0
+
b2
;
a2
=
b0
-
b2
;
a1
=
b1
+
b3
;
a3
=
twiddle
(
b1
-
b3
)
;
a4
=
b4
+
b6
;
a6
=
b4
-
b6
;
a5
=
b5
+
b7
;
a7
=
twiddle
(
b5
-
b7
)
;
a1
=
mul_float2
(
twiddles[7*k],
smem[x
+
t]
)
;
a2
=
mul_float2
(
twiddles[7*k+1],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[7*k+2],smem[x+3*t]
)
;
a4
=
mul_float2
(
twiddles[7*k+3],smem[x+4*t]
)
;
a5
=
mul_float2
(
twiddles[7*k+4],smem[x+5*t]
)
;
a6
=
mul_float2
(
twiddles[7*k+5],smem[x+6*t]
)
;
a7
=
mul_float2
(
twiddles[7*k+6],smem[x+7*t]
)
;
float2
b0,
b1,
b6,
b7
;
b0
=
a0
+
a4
;
a4
=
a0
-
a4
;
b1
=
a1
+
a5
;
a5
=
a1
-
a5
;
a5
=
(
float2
)(
SQRT_2
)
*
(
float2
)(
a5.x
+
a5.y,
-a5.x
+
a5.y
)
;
b6
=
twiddle
(
a2
-
a6
)
;
a2
=
a2
+
a6
;
b7
=
a3
-
a7
;
b7
=
(
float2
)(
SQRT_2
)
*
(
float2
)(
-b7.x
+
b7.y,
-b7.x
-
b7.y
)
;
a3
=
a3
+
a7
;
a0
=
b0
+
a2
;
a2
=
b0
-
a2
;
a1
=
b1
+
a3
;
a3
=
twiddle
(
b1
-
a3
)
;
a6
=
a4
-
b6
;
a4
=
a4
+
b6
;
a7
=
twiddle
(
a5
-
b7
)
;
a5
=
a5
+
b7
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -181,21 +150,16 @@ void fft_radix8(__local float2* smem, const int x, const int block_size, const i
}
__attribute__
((
always_inline
))
void
fft_radix3
(
__local
float2*
smem,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix3
(
__local
float2*
smem,
__global
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2
,
b0,
b1
;
float2
a0,
a1,
a2
;
if
(
x
<
t
)
{
const
float
theta
=
-PI
*
k
*
2
/
(
3
*
block_size
)
;
a0
=
smem[x]
;
a1
=
mul_float2
(
sincos_float2
(
theta
)
,
smem[x+t]
)
;
a2
=
mul_float2
(
sincos_float2
(
2
*
theta
)
,
smem[x+2*t]
)
;
b1
=
a1
+
a2
;
a2
=
twiddle
((
float2
)
sin_120*
(
a1
-
a2
))
;
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
a1
=
mul_float2
(
twiddles[2*k],
smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[2*k+1],
smem[x+2*t]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -204,6 +168,10 @@ void fft_radix3(__local float2* smem, const int x, const int block_size, const i
{
const
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
float2
b1
=
a1
+
a2
;
a2
=
twiddle
((
float2
)
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
2*block_size]
=
b0
-
a2
;
...
...
@@ -213,20 +181,30 @@ void fft_radix3(__local float2* smem, const int x, const int block_size, const i
}
__attribute__
((
always_inline
))
void
fft_radix5
(
__local
float2*
smem,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix5
(
__local
float2*
smem,
__global
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2,
a3,
a4
,
b0,
b1,
b2,
b5
;
float2
a0,
a1,
a2,
a3,
a4
;
if
(
x
<
t
)
{
const
float
theta
=
-PI
*
k
*
2
/
(
5
*
block_size
)
;
int
tw_ind
=
block_size
/
5
;
a0
=
smem[x]
;
a1
=
mul_float2
(
sincos_float2
(
theta
)
,
smem[x
+
t]
)
;
a2
=
mul_float2
(
sincos_float2
(
theta*2
)
,
smem[x+2*t]
)
;
a3
=
mul_float2
(
sincos_float2
(
theta*3
)
,
smem[x+3*t]
)
;
a4
=
mul_float2
(
sincos_float2
(
theta*4
)
,
smem[x+4*t]
)
;
a1
=
mul_float2
(
twiddles[4*k],
smem[x
+
t]
)
;
a2
=
mul_float2
(
twiddles[4*k+1],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[4*k+2],smem[x+3*t]
)
;
a4
=
mul_float2
(
twiddles[4*k+3],smem[x+4*t]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
{
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2*
dst
=
smem
+
dst_ind
;
float2
b0,
b1,
b5
;
b1
=
a1
+
a4
;
a1
-=
a4
;
...
...
@@ -234,13 +212,11 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i
a4
=
a3
+
a2
;
a3
-=
a2
;
b2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
b2
;
b1
=
(
float2
)
fft5_2
*
(
b1
-
a4
)
;
a4
=
-
(
float2
)
fft5_3
*
(
a1
+
a3
)
;
a4
=
twiddle
(
a4
)
;
a2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
a2
;
b1
=
(
float2
)
fft5_2
*
(
b1
-
a4
)
;
a4
=
(
float2
)
fft5_3
*
(
float2
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
b5
=
(
float2
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
a4.x
+=
fft5_4
*
a3.y
;
...
...
@@ -248,16 +224,8 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i
a1
=
b0
+
b1
;
b0
-=
b1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
{
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2*
dst
=
smem
+
dst_ind
;
dst[0]
=
a0
+
b
2
;
dst[0]
=
a0
+
a
2
;
dst[block_size]
=
a1
+
a4
;
dst[2
*
block_size]
=
b0
+
b5
;
dst[3
*
block_size]
=
b0
-
b5
;
...
...
@@ -267,8 +235,9 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__kernel
void
fft_multi_radix
(
__global
const
uchar*
srcptr,
int
src_step,
int
src_offset,
__global
uchar*
dstptr,
int
dst_step,
int
dst_offset,
__kernel
void
fft_multi_radix
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
__global
const
uchar*
twiddles_ptr,
int
twiddles_step,
int
twiddles_offset,
const
int
t
,
const
int
nz
)
{
const
int
x
=
get_global_id
(
0
)
;
...
...
@@ -277,8 +246,9 @@ __kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int sr
if
(
y
<
nz
)
{
__local
float2
smem[LOCAL_SIZE]
;
__global
const
float2*
src
=
(
__global
const
float2*
)(
srcptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
)))
;
__global
float2*
dst
=
(
__global
float2*
)(
dstptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
)))
;
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
)))
;
__global
float2*
dst
=
(
__global
float2*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
)))
;
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
#
pragma
unroll
...
...
@@ -292,6 +262,8 @@ __kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int sr
//
copy
data
to
dst
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
{
dst[i*block_size]
=
smem[x
+
i*block_size]
;
}
}
}
\ No newline at end of file
modules/core/test/ocl/test_dft.cpp
View file @
0318d277
...
...
@@ -115,10 +115,10 @@ OCL_TEST_P(Dft, Mat)
OCL_OFF
(
cv
::
dft
(
src
,
dst
,
dft_flags
));
OCL_ON
(
cv
::
dft
(
usrc
,
udst
,
dft_flags
));
Mat
gpu
=
udst
.
getMat
(
ACCESS_READ
);
std
::
cout
<<
src
<<
std
::
endl
;
std
::
cout
<<
dst
<<
std
::
endl
;
std
::
cout
<<
gpu
<<
std
::
endl
;
//
Mat gpu = udst.getMat(ACCESS_READ);
//
std::cout << src << std::endl;
//
std::cout << dst << std::endl;
//
std::cout << gpu << std::endl;
//int cn = udst.channels();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment