Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
52f76a32
Commit
52f76a32
authored
Jul 22, 2014
by
Alexander Karsakov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added rest Elena's changes
parent
77912645
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
458 additions
and
404 deletions
+458
-404
perf_dxt.cpp
modules/core/perf/opencl/perf_dxt.cpp
+3
-3
dxt.cpp
modules/core/src/dxt.cpp
+90
-41
fft.cl
modules/core/src/opencl/fft.cl
+349
-344
test_dft.cpp
modules/core/test/ocl/test_dft.cpp
+16
-16
No files found.
modules/core/perf/opencl/perf_dxt.cpp
View file @
52f76a32
...
...
@@ -65,10 +65,10 @@ enum OCL_FFT_TYPE
typedef
tuple
<
OCL_FFT_TYPE
,
Size
,
int
>
DftParams
;
typedef
TestBaseWithParam
<
DftParams
>
DftFixture
;
OCL_PERF_TEST_P
(
DftFixture
,
Dft
,
::
testing
::
Combine
(
Values
(
C2C
/*, R2R, C2R, R2C*/
),
OCL_PERF_TEST_P
(
DftFixture
,
Dft
,
::
testing
::
Combine
(
Values
(
C2C
,
R2R
,
C2R
,
R2C
),
Values
(
OCL_SIZE_1
,
OCL_SIZE_2
,
OCL_SIZE_3
,
Size
(
1024
,
1024
),
Size
(
512
,
512
),
Size
(
2048
,
2048
)),
Values
((
int
)
0
,
(
int
)
DFT_ROWS
,
(
int
)
DFT_SCALE
,
(
int
)
DFT_INVERSE
,
/*(int)DFT_INVERSE | DFT_SCALE,*/
(
int
)
DFT_ROWS
|
DFT_INVERSE
)))
Values
((
int
)
0
,
(
int
)
DFT_ROWS
,
(
int
)
DFT_SCALE
/*
, (int)DFT_INVERSE,
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/
)))
{
const
DftParams
params
=
GetParam
();
const
int
dft_type
=
get
<
0
>
(
params
);
...
...
modules/core/src/dxt.cpp
View file @
52f76a32
...
...
@@ -1791,14 +1791,6 @@ namespace cv {
CV_Assert(s == CLFFT_SUCCESS); \
}
enum
FftType
{
R2R
=
0
,
// real to real
C2R
=
1
,
// opencl HERMITIAN_INTERLEAVED to real
R2C
=
2
,
// real to opencl HERMITIAN_INTERLEAVED
C2C
=
3
// complex to complex
};
class
PlanCache
{
struct
FftPlan
...
...
@@ -2034,6 +2026,14 @@ namespace cv
#ifdef HAVE_OPENCL
enum
FftType
{
R2R
=
0
,
C2R
=
1
,
R2C
=
2
,
C2C
=
3
};
static
std
::
vector
<
int
>
ocl_getRadixes
(
int
cols
,
std
::
vector
<
int
>&
radixes
,
std
::
vector
<
int
>&
blocks
,
int
&
min_radix
)
{
int
factors
[
34
];
...
...
@@ -2054,13 +2054,19 @@ static std::vector<int> ocl_getRadixes(int cols, std::vector<int>& radixes, std:
else
if
(
4
*
n
<=
factors
[
0
])
{
radix
=
4
;
if
(
cols
%
8
==
0
)
if
(
cols
%
12
==
0
)
block
=
3
;
else
if
(
cols
%
8
==
0
)
block
=
2
;
}
else
{
if
(
cols
%
8
==
0
)
if
(
cols
%
10
==
0
)
block
=
5
;
else
if
(
cols
%
8
==
0
)
block
=
4
;
else
if
(
cols
%
6
==
0
)
block
=
3
;
else
if
(
cols
%
4
==
0
)
block
=
2
;
}
...
...
@@ -2081,6 +2087,8 @@ static std::vector<int> ocl_getRadixes(int cols, std::vector<int>& radixes, std:
{
if
(
cols
%
12
==
0
)
block
=
4
;
else
if
(
cols
%
9
==
0
)
block
=
3
;
else
if
(
cols
%
6
==
0
)
block
=
2
;
}
...
...
@@ -2142,7 +2150,6 @@ struct OCL_FftPlan
{
int
radix
=
radixes
[
i
];
n
*=
radix
;
for
(
int
j
=
1
;
j
<
radix
;
j
++
)
{
...
...
@@ -2160,7 +2167,7 @@ struct OCL_FftPlan
dft_size
,
dft_size
/
thread_count
,
radix_processing
.
c_str
());
}
bool
enqueueTransform
(
InputArray
_src
,
OutputArray
_dst
,
int
dft_size
,
int
flags
,
bool
rows
=
true
)
const
bool
enqueueTransform
(
InputArray
_src
,
OutputArray
_dst
,
int
dft_size
,
int
flags
,
int
fftType
,
bool
rows
=
true
)
const
{
if
(
!
status
)
return
false
;
...
...
@@ -2195,12 +2202,25 @@ struct OCL_FftPlan
if
(
src
.
channels
()
==
1
)
options
+=
" -D REAL_INPUT"
;
else
options
+=
" -D COMPLEX_INPUT"
;
if
(
dst
.
channels
()
==
1
)
options
+=
" -D CCS_OUTPUT"
;
if
((
is1d
&&
src
.
channels
()
==
1
)
||
(
rows
&&
(
flags
&
DFT_REAL_OUTPUT
)))
options
+=
" -D NO_CONJUGATE"
;
options
+=
" -D REAL_OUTPUT"
;
if
(
is1d
)
options
+=
" -D IS_1D"
;
if
(
!
inv
)
{
if
((
is1d
&&
src
.
channels
()
==
1
)
||
(
rows
&&
(
fftType
==
R2R
)))
options
+=
" -D NO_CONJUGATE"
;
}
else
{
if
(
is1d
&&
fftType
==
C2R
||
(
rows
&&
fftType
==
R2R
))
options
+=
" -D NO_CONJUGATE"
;
if
(
dst
.
cols
%
2
==
0
)
options
+=
" -D EVEN"
;
}
ocl
::
Kernel
k
(
kernel_name
.
c_str
(),
ocl
::
core
::
fft_oclsrc
,
options
);
if
(
k
.
empty
())
...
...
@@ -2253,16 +2273,16 @@ protected:
std
::
vector
<
OCL_FftPlan
*>
planStorage
;
};
static
bool
ocl_dft_C2C_rows
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_rows
,
int
flags
)
static
bool
ocl_dft_C2C_rows
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_rows
,
int
flags
,
int
fftType
)
{
const
OCL_FftPlan
*
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
cols
(),
flags
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_rows
,
flags
,
true
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_rows
,
flags
,
fftType
,
true
);
}
static
bool
ocl_dft_C2C_cols
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_cols
,
int
flags
)
static
bool
ocl_dft_C2C_cols
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_cols
,
int
flags
,
int
fftType
)
{
const
OCL_FftPlan
*
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
rows
(),
flags
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_cols
,
flags
,
false
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_cols
,
flags
,
f
ftType
,
f
alse
);
}
static
bool
ocl_dft
(
InputArray
_src
,
OutputArray
_dst
,
int
flags
,
int
nonzero_rows
)
...
...
@@ -2298,29 +2318,26 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
complex_output
=
1
;
}
FftType
fftType
=
(
FftType
)(
complex_input
<<
0
|
complex_output
<<
1
);
// Forward Complex to CCS not supported
if
(
complex_input
&&
real_output
&&
!
inv
)
{
flags
^=
DFT_REAL_OUTPUT
;
flags
|=
DFT_COMPLEX_OUTPUT
;
real_output
=
0
;
complex_output
=
1
;
}
if
(
fftType
==
C2R
&&
!
inv
)
fftType
=
C2C
;
// Inverse CCS to Complex not supported
if
(
real_input
&&
complex_output
&&
inv
)
{
complex_output
=
0
;
real_output
=
1
;
}
if
(
fftType
==
R2C
&&
inv
)
fftType
=
R2R
;
UMat
output
;
if
(
complex_output
)
if
(
fftType
==
C2C
||
fftType
==
R2C
)
{
// complex output
_dst
.
create
(
src
.
size
(),
CV_32FC2
);
output
=
_dst
.
getUMat
();
}
}
else
{
// real output
if
(
is1d
)
{
_dst
.
create
(
src
.
size
(),
CV_32FC1
);
...
...
@@ -2333,17 +2350,49 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
}
}
if
(
!
ocl_dft_C2C_rows
(
src
,
output
,
nonzero_rows
,
flags
))
return
false
;
if
(
!
is1d
)
if
(
!
inv
)
{
int
nonzero_cols
=
real_input
&&
real_output
?
output
.
cols
/
2
+
1
:
output
.
cols
;
if
(
!
ocl_dft_C2C_cols
(
output
,
_dst
,
nonzero_cols
,
flags
))
if
(
!
ocl_dft_C2C_rows
(
src
,
output
,
nonzero_rows
,
flags
,
fftType
))
return
false
;
}
else
if
(
!
is1d
)
{
int
nonzero_cols
=
fftType
==
R2R
?
output
.
cols
/
2
+
1
:
output
.
cols
;
if
(
!
ocl_dft_C2C_cols
(
output
,
_dst
,
nonzero_cols
,
flags
,
fftType
))
return
false
;
}
}
else
{
_dst
.
assign
(
output
);
if
(
fftType
==
C2C
)
{
// complex output
if
(
!
ocl_dft_C2C_rows
(
src
,
output
,
nonzero_rows
,
flags
,
fftType
))
return
false
;
if
(
!
is1d
)
{
if
(
!
ocl_dft_C2C_cols
(
output
,
output
,
output
.
cols
,
flags
,
fftType
))
return
false
;
}
}
else
{
if
(
is1d
)
{
if
(
!
ocl_dft_C2C_rows
(
src
,
output
,
nonzero_rows
,
flags
,
fftType
))
return
false
;
}
else
{
int
nonzero_cols
=
src
.
cols
/
2
+
1
;
// : src.cols;
if
(
!
ocl_dft_C2C_cols
(
src
,
output
,
nonzero_cols
,
flags
,
fftType
))
return
false
;
if
(
!
ocl_dft_C2C_rows
(
output
,
_dst
,
nonzero_rows
,
flags
,
fftType
))
return
false
;
}
}
}
return
true
;
}
...
...
modules/core/src/opencl/fft.cl
View file @
52f76a32
...
...
@@ -16,106 +16,224 @@ float2 twiddle(float2 a) {
}
__attribute__
((
always_inline
))
void
fft_radix2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
butterfly2
(
float2
a0,
float2
a1,
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_float2
(
twiddles[k],
a1
)
;
const
int
dst_ind
=
(
x
<<
1
)
-
k
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind+block_size]
=
a0
-
a1
;
}
__attribute__
((
always_inline
))
void
butterfly4
(
float2
a0,
float2
a1,
float2
a2,
float2
a3,
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_float2
(
twiddles[k],
a1
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],
a2
)
;
a3
=
mul_float2
(
twiddles[k
+
2*block_size],
a3
)
;
const
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
float2
b0
=
a0
+
a2
;
a2
=
a0
-
a2
;
float2
b1
=
a1
+
a3
;
a3
=
twiddle
(
a1
-
a3
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind
+
block_size]
=
a2
+
a3
;
smem[dst_ind
+
2*block_size]
=
b0
-
b1
;
smem[dst_ind
+
3*block_size]
=
a2
-
a3
;
}
__attribute__
((
always_inline
))
void
butterfly3
(
float2
a0,
float2
a1,
float2
a2,
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_float2
(
twiddles[k],
a1
)
;
a2
=
mul_float2
(
twiddles[k+block_size],
a2
)
;
const
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
float2
b1
=
a1
+
a2
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
2*block_size]
=
b0
-
a2
;
}
__attribute__
((
always_inline
))
void
butterfly5
(
float2
a0,
float2
a1,
float2
a2,
float2
a3,
float2
a4,
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_float2
(
twiddles[k],
a1
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],
a2
)
;
a3
=
mul_float2
(
twiddles[k+2*block_size],
a3
)
;
a4
=
mul_float2
(
twiddles[k+3*block_size],
a4
)
;
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2*
dst
=
smem
+
dst_ind
;
float2
b0,
b1,
b5
;
b1
=
a1
+
a4
;
a1
-=
a4
;
a4
=
a3
+
a2
;
a3
-=
a2
;
a2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
a2
;
b1
=
fft5_2
*
(
b1
-
a4
)
;
a4
=
fft5_3
*
(
float2
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
b5
=
(
float2
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
a4.x
+=
fft5_4
*
a3.y
;
a4.y
-=
fft5_4
*
a3.x
;
a1
=
b0
+
b1
;
b0
-=
b1
;
dst[0]
=
a0
+
a2
;
dst[block_size]
=
a1
+
a4
;
dst[2
*
block_size]
=
b0
+
b5
;
dst[3
*
block_size]
=
b0
-
b5
;
dst[4
*
block_size]
=
a1
-
a4
;
}
__attribute__
((
always_inline
))
void
fft_radix2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
float2
a0,
a1
;
if
(
x
<
t
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],smem[x+t]
)
;
a1
=
smem[x+t]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
{
const
int
dst_ind
=
(
x
<<
1
)
-
k
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind+block_size]
=
a0
-
a1
;
}
butterfly2
(
a0,
a1,
smem,
twiddles,
x,
block_size
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix2_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix2_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
1
,
const
int
block_size,
const
int
t
)
{
const
int
k1
=
x
&
(
block_size
-
1
)
;
const
int
x2
=
x
+
t/2
;
const
int
k2
=
x2
&
(
block_size
-
1
)
;
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k1],smem[x+t]
)
;
a2
=
smem[x2]
;
a3
=
mul_float2
(
twiddles[k2],smem[x2+t]
)
;
a0
=
smem[x1]
; a1 = smem[x1+t];
a2
=
smem[x2]
; a3 = smem[x2+t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
int
dst_ind
=
(
x
<<
1
)
-
k1
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind+block_size]
=
a0
-
a1
;
butterfly2
(
a0,
a1,
smem,
twiddles,
x1,
block_size
)
;
butterfly2
(
a2,
a3,
smem,
twiddles,
x2,
block_size
)
;
}
dst_ind
=
(
x2
<<
1
)
-
k2
;
smem[dst_ind]
=
a2
+
a3
;
smem[dst_ind+block_size]
=
a2
-
a3
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix2_B3
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x1
+
2*t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5
;
if
(
x1
<
t/3
)
{
a0
=
smem[x1]
; a1 = smem[x1+t];
a2
=
smem[x2]
; a3 = smem[x2+t];
a4
=
smem[x3]
; a5 = smem[x3+t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x1
<
t/3
)
{
butterfly2
(
a0,
a1,
smem,
twiddles,
x1,
block_size
)
;
butterfly2
(
a2,
a3,
smem,
twiddles,
x2,
block_size
)
;
butterfly2
(
a4,
a5,
smem,
twiddles,
x3,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix2_B4
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix2_B4
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
1
,
const
int
block_size,
const
int
t
)
{
const
int
thread_block
=
t/4
;
const
int
k1
=
x
&
(
block_size
-
1
)
;
const
int
x2
=
x
+
thread_block
;
const
int
k2
=
x2
&
(
block_size
-
1
)
;
const
int
x3
=
x
+
2*thread_block
;
const
int
k3
=
x3
&
(
block_size
-
1
)
;
const
int
x4
=
x
+
3*thread_block
;
const
int
k4
=
x4
&
(
block_size
-
1
)
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x
<
t/4
)
if
(
x
1
<
t/4
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k1],smem[x+t]
)
;
a2
=
smem[x2]
;
a3
=
mul_float2
(
twiddles[k2],smem[x2+t]
)
;
a4
=
smem[x3]
;
a5
=
mul_float2
(
twiddles[k3],smem[x3+t]
)
;
a6
=
smem[x4]
;
a7
=
mul_float2
(
twiddles[k4],smem[x4+t]
)
;
a0
=
smem[x1]
; a1 = smem[x1+t];
a2
=
smem[x2]
; a3 = smem[x2+t];
a4
=
smem[x3]
; a5 = smem[x3+t];
a6
=
smem[x4]
; a7 = smem[x4+t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/4
)
if
(
x
1
<
t/4
)
{
int
dst_ind
=
(
x
<<
1
)
-
k1
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind+block_size]
=
a0
-
a1
;
butterfly2
(
a0,
a1,
smem,
twiddles,
x1,
block_size
)
;
butterfly2
(
a2,
a3,
smem,
twiddles,
x2,
block_size
)
;
butterfly2
(
a4,
a5,
smem,
twiddles,
x3,
block_size
)
;
butterfly2
(
a6,
a7,
smem,
twiddles,
x4,
block_size
)
;
}
dst_ind
=
(
x2
<<
1
)
-
k2
;
smem[dst_ind]
=
a2
+
a3
;
smem[dst_ind+block_size]
=
a2
-
a3
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
dst_ind
=
(
x3
<<
1
)
-
k3
;
smem[dst_ind]
=
a4
+
a5
;
smem[dst_ind+block_size]
=
a4
-
a5
;
__attribute__
((
always_inline
))
void
fft_radix2_B5
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
const
int
thread_block
=
t/5
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
const
int
x5
=
x1
+
4*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
dst_ind
=
(
x4
<<
1
)
-
k4
;
smem[dst_ind]
=
a6
+
a7
;
smem[dst_ind+block_size]
=
a6
-
a7
;
if
(
x1
<
t/5
)
{
a0
=
smem[x1]
; a1 = smem[x1+t];
a2
=
smem[x2]
; a3 = smem[x2+t];
a4
=
smem[x3]
; a5 = smem[x3+t];
a6
=
smem[x4]
; a7 = smem[x4+t];
a8
=
smem[x5]
; a9 = smem[x5+t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x1
<
t/5
)
{
butterfly2
(
a0,
a1,
smem,
twiddles,
x1,
block_size
)
;
butterfly2
(
a2,
a3,
smem,
twiddles,
x2,
block_size
)
;
butterfly2
(
a4,
a5,
smem,
twiddles,
x3,
block_size
)
;
butterfly2
(
a6,
a7,
smem,
twiddles,
x4,
block_size
)
;
butterfly2
(
a8,
a9,
smem,
twiddles,
x5,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -124,85 +242,65 @@ void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, cons
__attribute__
((
always_inline
))
void
fft_radix4
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
float2
a0,
a1,
a2,
a3
;
if
(
x
<
t
)
{
const
int
twiddle_block
=
block_size
/
4
;
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[k
+
2*block_size],smem[x+3*t]
)
;
a0
=
smem[x]
; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
butterfly4
(
a0,
a1,
a2,
a3,
smem,
twiddles,
x,
block_size
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix4_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x1
<
t/2
)
{
const
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
a0
=
smem[x1]
; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4
=
smem[x2]
; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
}
float2
b0
=
a0
+
a2
;
a2
=
a0
-
a2
;
float2
b1
=
a1
+
a3
;
a3
=
twiddle
(
a1
-
a3
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind
+
block_size]
=
a2
+
a3
;
smem[dst_ind
+
2*block_size]
=
b0
-
b1
;
smem[dst_ind
+
3*block_size]
=
a2
-
a3
;
if
(
x1
<
t/2
)
{
butterfly4
(
a0,
a1,
a2,
a3,
smem,
twiddles,
x1,
block_size
)
;
butterfly4
(
a4,
a5,
a6,
a7,
smem,
twiddles,
x2,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix4_B
2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
,
const
int
block_size,
const
int
t
)
void
fft_radix4_B
3
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x1
,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
&
(
block_size
-
1
)
;
const
int
x2
=
x
+
t/2
;
const
int
k2
=
x2
&
(
block_size
-
1
)
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x2
+
t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
if
(
x
<
t/2
)
if
(
x
1
<
t/3
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[k
+
2*block_size],smem[x+3*t]
)
;
a4
=
smem[x2]
;
a5
=
mul_float2
(
twiddles[k2],
smem[x2+t]
)
;
a6
=
mul_float2
(
twiddles[k2
+
block_size],smem[x2+2*t]
)
;
a7
=
mul_float2
(
twiddles[k2
+
2*block_size],smem[x2+3*t]
)
;
a0
=
smem[x1]
; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
a4
=
smem[x2]
; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
a8
=
smem[x3]
; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/2
)
if
(
x
1
<
t/3
)
{
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
float2
b0
=
a0
+
a2
;
a2
=
a0
-
a2
;
float2
b1
=
a1
+
a3
;
a3
=
twiddle
(
a1
-
a3
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind
+
block_size]
=
a2
+
a3
;
smem[dst_ind
+
2*block_size]
=
b0
-
b1
;
smem[dst_ind
+
3*block_size]
=
a2
-
a3
;
dst_ind
=
((
x2
-
k2
)
<<
2
)
+
k2
;
b0
=
a4
+
a6
;
a6
=
a4
-
a6
;
b1
=
a5
+
a7
;
a7
=
twiddle
(
a5
-
a7
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind
+
block_size]
=
a6
+
a7
;
smem[dst_ind
+
2*block_size]
=
b0
-
b1
;
smem[dst_ind
+
3*block_size]
=
a6
-
a7
;
butterfly4
(
a0,
a1,
a2,
a3,
smem,
twiddles,
x1,
block_size
)
;
butterfly4
(
a4,
a5,
a6,
a7,
smem,
twiddles,
x2,
block_size
)
;
butterfly4
(
a8,
a9,
a10,
a11,
smem,
twiddles,
x3,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -274,156 +372,95 @@ void fft_radix8(__local float2* smem, __constant const float2* twiddles, const i
__attribute__
((
always_inline
))
void
fft_radix3
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2
;
if
(
x
<
t
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[k+block_size],
smem[x+2*t]
)
;
a0
=
smem[x]
; a1 = smem[x+t]; a2 = smem[x+2*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
{
const
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
float2
b1
=
a1
+
a2
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
2*block_size]
=
b0
-
a2
;
}
butterfly3
(
a0,
a1,
a2,
smem,
twiddles,
x,
block_size
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix3_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix3_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
1
,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
const
int
x2
=
x
+
t/2
;
const
int
k2
=
x2
%
block_size
;
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[k+block_size],
smem[x+2*t]
)
;
a3
=
smem[x2]
;
a4
=
mul_float2
(
twiddles[k2],
smem[x2+t]
)
;
a5
=
mul_float2
(
twiddles[k2+block_size],
smem[x2+2*t]
)
;
a0
=
smem[x1]
; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3
=
smem[x2]
; a4 = smem[x2+t]; a5 = smem[x2+2*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
butterfly3
(
a0,
a1,
a2,
smem,
twiddles,
x1,
block_size
)
;
butterfly3
(
a3,
a4,
a5,
smem,
twiddles,
x2,
block_size
)
;
}
float2
b1
=
a1
+
a2
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
2*block_size]
=
b0
-
a2
;
__attribute__
((
always_inline
))
void
fft_radix3_B3
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x2
+
t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8
;
dst_ind
=
((
x2
-
k2
)
*
3
)
+
k2
;
if
(
x1
<
t/2
)
{
a0
=
smem[x1]
; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3
=
smem[x2]
; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6
=
smem[x3]
; a7 = smem[x3+t]; a8 = smem[x3+2*t];
}
b1
=
a4
+
a5
;
a5
=
twiddle
(
sin_120*
(
a4
-
a5
))
;
b0
=
a3
-
(
float2
)(
0.5f
)
*b1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
smem[dst_ind]
=
a3
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a5
;
smem[dst_ind
+
2*block_size]
=
b0
-
a5
;
if
(
x1
<
t/2
)
{
butterfly3
(
a0,
a1,
a2,
smem,
twiddles,
x1,
block_size
)
;
butterfly3
(
a3,
a4,
a5,
smem,
twiddles,
x2,
block_size
)
;
butterfly3
(
a6,
a7,
a8,
smem,
twiddles,
x3,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix3_B4
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix3_B4
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
1
,
const
int
block_size,
const
int
t
)
{
const
int
thread_block
=
t/4
;
const
int
k
=
x
%
block_size
;
const
int
x2
=
x
+
thread_block
;
const
int
k2
=
x2
%
block_size
;
const
int
x3
=
x
+
2*thread_block
;
const
int
k3
=
x3
%
block_size
;
const
int
x4
=
x
+
3*thread_block
;
const
int
k4
=
x4
%
block_size
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
if
(
x
<
t/4
)
if
(
x
1
<
t/4
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x+t]
)
;
a2
=
mul_float2
(
twiddles[k+block_size],
smem[x+2*t]
)
;
a3
=
smem[x2]
;
a4
=
mul_float2
(
twiddles[k2],
smem[x2+t]
)
;
a5
=
mul_float2
(
twiddles[k2+block_size],
smem[x2+2*t]
)
;
a6
=
smem[x3]
;
a7
=
mul_float2
(
twiddles[k3],
smem[x3+t]
)
;
a8
=
mul_float2
(
twiddles[k3+block_size],
smem[x3+2*t]
)
;
a9
=
smem[x4]
;
a10
=
mul_float2
(
twiddles[k4],
smem[x4+t]
)
;
a11
=
mul_float2
(
twiddles[k4+block_size],
smem[x4+2*t]
)
;
a0
=
smem[x1]
; a1 = smem[x1+t]; a2 = smem[x1+2*t];
a3
=
smem[x2]
; a4 = smem[x2+t]; a5 = smem[x2+2*t];
a6
=
smem[x3]
; a7 = smem[x3+t]; a8 = smem[x3+2*t];
a9
=
smem[x4]
; a10 = smem[x4+t]; a11 = smem[x4+2*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/4
)
if
(
x
1
<
t/4
)
{
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
float2
b1
=
a1
+
a2
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
2*block_size]
=
b0
-
a2
;
dst_ind
=
((
x2
-
k2
)
*
3
)
+
k2
;
b1
=
a4
+
a5
;
a5
=
twiddle
(
sin_120*
(
a4
-
a5
))
;
b0
=
a3
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a3
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a5
;
smem[dst_ind
+
2*block_size]
=
b0
-
a5
;
dst_ind
=
((
x3
-
k3
)
*
3
)
+
k3
;
b1
=
a7
+
a8
;
a8
=
twiddle
(
sin_120*
(
a7
-
a8
))
;
b0
=
a6
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a6
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a8
;
smem[dst_ind
+
2*block_size]
=
b0
-
a8
;
dst_ind
=
((
x4
-
k4
)
*
3
)
+
k4
;
b1
=
a10
+
a11
;
a11
=
twiddle
(
sin_120*
(
a10
-
a11
))
;
b0
=
a9
-
(
float2
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a9
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a11
;
smem[dst_ind
+
2*block_size]
=
b0
-
a11
;
butterfly3
(
a0,
a1,
a2,
smem,
twiddles,
x1,
block_size
)
;
butterfly3
(
a3,
a4,
a5,
smem,
twiddles,
x2,
block_size
)
;
butterfly3
(
a6,
a7,
a8,
smem,
twiddles,
x3,
block_size
)
;
butterfly3
(
a9,
a10,
a11,
smem,
twiddles,
x4,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -437,135 +474,35 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i
if
(
x
<
t
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x
+
t]
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[k+2*block_size],smem[x+3*t]
)
;
a4
=
mul_float2
(
twiddles[k+3*block_size],smem[x+4*t]
)
;
a0
=
smem[x]
; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t
)
{
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2*
dst
=
smem
+
dst_ind
;
float2
b0,
b1,
b5
;
b1
=
a1
+
a4
;
a1
-=
a4
;
a4
=
a3
+
a2
;
a3
-=
a2
;
a2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
a2
;
b1
=
fft5_2
*
(
b1
-
a4
)
;
a4
=
fft5_3
*
(
float2
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
b5
=
(
float2
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
a4.x
+=
fft5_4
*
a3.y
;
a4.y
-=
fft5_4
*
a3.x
;
a1
=
b0
+
b1
;
b0
-=
b1
;
dst[0]
=
a0
+
a2
;
dst[block_size]
=
a1
+
a4
;
dst[2
*
block_size]
=
b0
+
b5
;
dst[3
*
block_size]
=
b0
-
b5
;
dst[4
*
block_size]
=
a1
-
a4
;
}
butterfly5
(
a0,
a1,
a2,
a3,
a4,
smem,
twiddles,
x,
block_size
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
__attribute__
((
always_inline
))
void
fft_radix5_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix5_B2
(
__local
float2*
smem,
__constant
const
float2*
twiddles,
const
int
x
1
,
const
int
block_size,
const
int
t
)
{
const
int
k
=
x
%
block_size
;
const
int
x2
=
x+t/2
;
const
int
k2
=
x2
%
block_size
;
const
int
x2
=
x1+t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
a0
=
smem[x]
;
a1
=
mul_float2
(
twiddles[k],
smem[x
+
t]
)
;
a2
=
mul_float2
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a3
=
mul_float2
(
twiddles[k+2*block_size],smem[x+3*t]
)
;
a4
=
mul_float2
(
twiddles[k+3*block_size],smem[x+4*t]
)
;
a5
=
smem[x2]
;
a6
=
mul_float2
(
twiddles[k2],
smem[x2
+
t]
)
;
a7
=
mul_float2
(
twiddles[k2
+
block_size],smem[x2+2*t]
)
;
a8
=
mul_float2
(
twiddles[k2+2*block_size],smem[x2+3*t]
)
;
a9
=
mul_float2
(
twiddles[k2+3*block_size],smem[x2+4*t]
)
;
a0
=
smem[x1]
; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
a5
=
smem[x2]
; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
t/2
)
if
(
x
1
<
t/2
)
{
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2*
dst
=
smem
+
dst_ind
;
float2
b0,
b1,
b5
;
b1
=
a1
+
a4
;
a1
-=
a4
;
a4
=
a3
+
a2
;
a3
-=
a2
;
a2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
a2
;
b1
=
fft5_2
*
(
b1
-
a4
)
;
a4
=
fft5_3
*
(
float2
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
b5
=
(
float2
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
a4.x
+=
fft5_4
*
a3.y
;
a4.y
-=
fft5_4
*
a3.x
;
a1
=
b0
+
b1
;
b0
-=
b1
;
dst[0]
=
a0
+
a2
;
dst[block_size]
=
a1
+
a4
;
dst[2
*
block_size]
=
b0
+
b5
;
dst[3
*
block_size]
=
b0
-
b5
;
dst[4
*
block_size]
=
a1
-
a4
;
dst_ind
=
((
x2
-
k2
)
*
5
)
+
k2
;
dst
=
smem
+
dst_ind
;
b1
=
a6
+
a9
;
a6
-=
a9
;
a9
=
a8
+
a7
;
a8
-=
a7
;
a7
=
b1
+
a9
;
b0
=
a5
-
(
float2
)
0.25f
*
a7
;
b1
=
fft5_2
*
(
b1
-
a9
)
;
a9
=
fft5_3
*
(
float2
)(
-a6.y
-
a8.y,
a6.x
+
a8.x
)
;
b5
=
(
float2
)(
a9.x
-
fft5_5
*
a6.y,
a9.y
+
fft5_5
*
a6.x
)
;
a9.x
+=
fft5_4
*
a8.y
;
a9.y
-=
fft5_4
*
a8.x
;
a6
=
b0
+
b1
;
b0
-=
b1
;
dst[0]
=
a5
+
a7
;
dst[block_size]
=
a6
+
a9
;
dst[2
*
block_size]
=
b0
+
b5
;
dst[3
*
block_size]
=
b0
-
b5
;
dst[4
*
block_size]
=
a6
-
a9
;
butterfly5
(
a0,
a1,
a2,
a3,
a4,
smem,
twiddles,
x1,
block_size
)
;
butterfly5
(
a5,
a6,
a7,
a8,
a9,
smem,
twiddles,
x2,
block_size
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -611,7 +548,7 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS
;
#
ifndef
CCS
_OUTPUT
#
ifndef
REAL
_OUTPUT
#
ifdef
NO_CONJUGATE
//
copy
result
without
complex
conjugate
const
int
cols
=
dst_cols/2
+
1
;
...
...
@@ -659,7 +596,7 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS
;
#
ifndef
CCS
_OUTPUT
#
ifndef
REAL
_OUTPUT
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
))
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
...
...
@@ -696,8 +633,8 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
}
}
__kernel
void
ifft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__kernel
void
ifft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
__constant
float2
*
twiddles_ptr,
const
int
t
,
const
int
nz
)
{
const
int
x
=
get_global_id
(
0
)
;
...
...
@@ -709,13 +646,8 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
__constant
const
float2*
twiddles
=
(
__constant
float2*
)
twiddles_ptr
;
const
int
ind
=
x
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
#
ifdef
IS_1D
float
scale
=
1.f/dst_cols
;
#
else
float
scale
=
1.f/
(
dst_cols*dst_rows
)
;
#
endif
#
ifndef
REAL
_INPUT
#
ifndef
REAL
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
)))
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
...
...
@@ -723,8 +655,14 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
smem[x+i*block_size].x
=
src[i*block_size].x
;
smem[x+i*block_size].y
=
-src[i*block_size].y
;
}
#
else
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
1
,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
#
else
__global
const
float2*
src
;
#
ifdef
COMPLEX_INPUT
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2
,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
#
else
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
1
,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
#
endif
#
pragma
unroll
for
(
int
i=x
; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
...
...
@@ -750,13 +688,13 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS
;
//
copy
data
to
dst
#
ifndef
REAL
_INPUT
#
ifndef
REAL
__global
float2*
dst
=
(
__global
float*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
)))
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
{
dst[i*block_size].x
=
VAL
(
smem[x
+
i*block_size].x,
scale
)
;
dst[i*block_size].y
=
VAL
(
-smem[x
+
i*block_size].y,
scale
)
;
dst[i*block_size].x
=
smem[x
+
i*block_size].x
;
dst[i*block_size].y
=
-smem[x
+
i*block_size].y
;
}
#
else
__global
float*
dst
=
(
__global
float*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
))
,
dst_offset
)))
;
...
...
@@ -769,13 +707,14 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
}
}
__kernel
void
ifft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__kernel
void
ifft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
__constant
float2
*
twiddles_ptr,
const
int
t
,
const
int
nz
)
{
const
int
x
=
get_group_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
#
ifndef
REAL
if
(
x
<
nz
)
{
__local
float2
smem[LOCAL_SIZE]
;
...
...
@@ -784,7 +723,6 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
__constant
const
float2*
twiddles
=
(
__constant
float2*
)
twiddles_ptr
;
const
int
ind
=
y
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
float
scale
=
1.f/
(
dst_rows*dst_cols
)
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
...
...
@@ -802,9 +740,75 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
{
__global
float2*
rez
=
(
__global
float2*
)(
dst
+
i*block_size*src
_step
)
;
rez[0].x
=
VAL
(
smem[y
+
i*block_size].x,
scale
)
;
rez[0].y
=
VAL
(
-smem[y
+
i*block_size].y,
scale
)
;
__global
float2*
rez
=
(
__global
float2*
)(
dst
+
i*block_size*dst
_step
)
;
rez[0].x
=
smem[y
+
i*block_size].x
;
rez[0].y
=
-smem[y
+
i*block_size].y
;
}
}
#
else
if
(
x
<
nz
)
{
__constant
const
float2*
twiddles
=
(
__constant
float2*
)
twiddles_ptr
;
const
int
ind
=
y
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
__local
float2
smem[LOCAL_SIZE]
;
#
ifdef
EVEN
if
(
x!=0
&&
(
x!=
(
nz-1
)))
#
else
if
(
x!=0
)
#
endif
{
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2*x-1,
(
int
)
sizeof
(
float
)
,
src_offset
))
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
{
float2
temp
=
*
((
__global
const
float2*
)(
src
+
i*block_size*src_step
))
;
smem[y+i*block_size].x
=
temp.x
;
smem[y+i*block_size].y
=
-temp.y
;
}
}
else
{
int
ind
=
x==0
?
0:
2*x-1
;
__global
const
float*
src
=
(
__global
const
float*
)(
src_ptr
+
mad24
(
1
,
src_step,
mad24
(
ind,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
int
step
=
src_step/
(
int
)
sizeof
(
float
)
;
#
pragma
unroll
for
(
int
i=y
; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
smem[i+1].x
=
src[2*i*step]
;
smem[i+1].y
=
-src[
(
2*i+1
)
*step]
;
smem[LOCAL_SIZE-i-1].x
=
src[2*i*step]
;;
smem[LOCAL_SIZE-i-1].y
=
src[
(
2*i+1
)
*step]
;
}
if
(
y==0
)
{
smem[0].x
=
*
(
__global
const
float*
)(
src_ptr
+
mad24
(
ind,
(
int
)
sizeof
(
float
)
,
src_offset
))
;
smem[0].y
=
0.f
;
if
(
LOCAL_SIZE
%
2
==0
)
{
smem[LOCAL_SIZE/2].x
=
src[
(
LOCAL_SIZE-2
)
*step]
;
smem[LOCAL_SIZE/2].y
=
0.f
;
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
RADIX_PROCESS
;
//
copy
data
to
dst
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
))
,
dst_offset
))
;
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
{
__global
float2*
rez
=
(
__global
float2*
)(
dst
+
i*block_size*dst_step
)
;
rez[0].x
=
smem[y
+
i*block_size].x
;
rez[0].y
=
-smem[y
+
i*block_size].y
;
}
}
#
endif
}
\ No newline at end of file
modules/core/test/ocl/test_dft.cpp
View file @
52f76a32
...
...
@@ -50,10 +50,10 @@
enum
OCL_FFT_TYPE
{
R2R
=
0
,
// real to real (CCS)
C2R
=
1
,
// complex to real (CCS)
R2C
=
2
,
// real (CCS) to complex
C2C
=
3
// complex to complex
R2R
=
0
,
C2R
=
1
,
R2C
=
2
,
C2C
=
3
};
namespace
cvtest
{
...
...
@@ -62,7 +62,7 @@ namespace ocl {
////////////////////////////////////////////////////////////////////////////
// Dft
PARAM_TEST_CASE
(
Dft
,
cv
::
Size
,
OCL_FFT_TYPE
,
bool
,
bool
,
bool
)
PARAM_TEST_CASE
(
Dft
,
cv
::
Size
,
OCL_FFT_TYPE
,
bool
,
bool
,
bool
,
bool
)
{
cv
::
Size
dft_size
;
int
dft_flags
,
depth
,
cn
,
dft_type
;
...
...
@@ -88,12 +88,12 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool)
}
if
(
GET_PARAM
(
2
))
dft_flags
|=
cv
::
DFT_
ROWS
;
dft_flags
|=
cv
::
DFT_
INVERSE
;
if
(
GET_PARAM
(
3
))
dft_flags
|=
cv
::
DFT_ROWS
;
if
(
GET_PARAM
(
4
))
dft_flags
|=
cv
::
DFT_SCALE
;
/*if (GET_PARAM(4))
dft_flags |= cv::DFT_INVERSE;*/
inplace
=
GET_PARAM
(
4
);
inplace
=
GET_PARAM
(
5
);
is1d
=
(
dft_flags
&
DFT_ROWS
)
!=
0
||
dft_size
.
height
==
1
;
...
...
@@ -116,16 +116,16 @@ OCL_TEST_P(Dft, Mat)
OCL_OFF
(
cv
::
dft
(
src
,
dst
,
dft_flags
));
OCL_ON
(
cv
::
dft
(
usrc
,
udst
,
dft_flags
));
if
(
dft_type
==
R2C
&&
is1d
)
if
(
dft_type
==
R2C
&&
is1d
&&
(
dft_flags
&
cv
::
DFT_INVERSE
)
==
0
)
{
dst
=
dst
(
cv
::
Range
(
0
,
dst
.
rows
),
cv
::
Range
(
0
,
dst
.
cols
/
2
+
1
));
udst
=
udst
(
cv
::
Range
(
0
,
udst
.
rows
),
cv
::
Range
(
0
,
udst
.
cols
/
2
+
1
));
}
//
Mat gpu = udst.getMat(ACCESS_READ);
//
std::cout << src << std::endl;
//
std::cout << dst << std::endl;
//
std::cout << gpu << std::endl;
Mat
gpu
=
udst
.
getMat
(
ACCESS_READ
);
std
::
cout
<<
src
<<
std
::
endl
;
std
::
cout
<<
dst
<<
std
::
endl
;
std
::
cout
<<
gpu
<<
std
::
endl
;
//int cn = udst.channels();
//
...
...
@@ -188,12 +188,12 @@ OCL_TEST_P(MulSpectrums, Mat)
OCL_INSTANTIATE_TEST_CASE_P
(
OCL_ImgProc
,
MulSpectrums
,
testing
::
Combine
(
Bool
(),
Bool
()));
OCL_INSTANTIATE_TEST_CASE_P
(
Core
,
Dft
,
Combine
(
Values
(
cv
::
Size
(
16
,
4
),
cv
::
Size
(
5
,
8
),
cv
::
Size
(
6
,
6
),
OCL_INSTANTIATE_TEST_CASE_P
(
Core
,
Dft
,
Combine
(
Values
(
cv
::
Size
(
4
,
1
),
cv
::
Size
(
5
,
8
),
cv
::
Size
(
6
,
6
),
cv
::
Size
(
512
,
1
),
cv
::
Size
(
1280
,
768
)),
Values
((
OCL_FFT_TYPE
)
R2C
,
(
OCL_FFT_TYPE
)
C2C
,
(
OCL_FFT_TYPE
)
R2R
,
(
OCL_FFT_TYPE
)
C2R
),
Bool
(),
// DFT_INVERSE
Bool
(),
// DFT_ROWS
Bool
(),
// DFT_SCALE
//Bool(), // DFT_INVERSE
Bool
()
// inplace
)
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment