Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv_contrib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv_contrib
Commits
9786a552
Commit
9786a552
authored
Aug 10, 2014
by
vludv
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Removed OpenCL realization to test build.
parent
291c87a1
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
0 additions
and
1626 deletions
+0
-1626
dtfilter_ocl.cpp
modules/ximgproc/src/dtfilter_ocl.cpp
+0
-834
dtfilter_ocl.hpp
modules/ximgproc/src/dtfilter_ocl.hpp
+0
-113
OpenCLKernel.hpp
modules/ximgproc/src/opencl/OpenCLKernel.hpp
+0
-0
dtfilter_dt.cl
modules/ximgproc/src/opencl/dtfilter_dt.cl
+0
-251
dtfilter_flt.cl
modules/ximgproc/src/opencl/dtfilter_flt.cl
+0
-428
No files found.
modules/ximgproc/src/dtfilter_ocl.cpp
deleted
100644 → 0
View file @
291c87a1
#include "precomp.hpp"
#include "dtfilter_ocl.hpp"
#include "opencl_kernels.hpp"
//#include <opencv2/highgui.hpp>
using
namespace
std
;
#ifdef _MSC_VER
# pragma warning(disable: 4512)
#endif
#define MEASURE_TIME(counter, code) { counter -= getTickCount(); {code;} counter += getTickCount(); }
#define MEASURE_KER_TIME(code) { kernelsTime -= getTickCount(); code; kernelsTime += getTickCount(); }
namespace
cv
{
int64
DTFilterOCL
::
totalKernelsTime
=
0
;
Ptr
<
DTFilterOCL
>
DTFilterOCL
::
create
(
InputArray
guide
,
double
sigmaSpatial
,
double
sigmaColor
,
int
mode
,
int
numIters
)
{
DTFilterOCL
*
p
=
new
DTFilterOCL
();
p
->
init
(
guide
,
sigmaSpatial
,
sigmaColor
,
mode
,
numIters
);
return
Ptr
<
DTFilterOCL
>
(
p
);
}
void
DTFilterOCL
::
init
(
InputArray
guide_
,
double
sigmaSpatial_
,
double
sigmaColor_
,
int
mode_
,
int
numIters_
)
{
CV_Assert
(
guide_
.
channels
()
<=
4
&&
(
guide_
.
depth
()
==
CV_8U
||
guide_
.
depth
()
==
CV_32F
)
);
CV_Assert
(
numIters_
<=
3
);
mode
=
mode_
;
numIters
=
numIters_
;
sigmaSpatial
=
std
::
max
(
1.00
f
,
(
float
)
sigmaSpatial_
);
sigmaColor
=
std
::
max
(
0.01
f
,
(
float
)
sigmaColor_
);
kernelsTime
=
0
;
singleFilterCall
=
false
;
h
=
guide_
.
rows
();
w
=
guide_
.
cols
();
NC_ocl_implememtation
=
ALG_INDEX
;
UMat
guide
=
guide_
.
getUMat
();
initProgram
();
setBuildOptionsDT
(
guide
);
if
(
mode
==
DTF_NC
)
{
initDT_NC
(
guide
);
}
else
if
(
mode
==
DTF_RF
)
{
initDT_RF
(
guide
);
}
else
if
(
mode
==
DTF_IC
)
{
initDT_IC
(
guide
);
}
else
{
CV_Error
(
Error
::
StsBadFlag
,
"Incorrect DT filter mode"
);
}
}
void
DTFilterOCL
::
initDT_NC
(
UMat
&
guide
)
{
bool
useIDT
=
false
;
if
(
useIDT
)
{
UMat
guideT
;
MEASURE_KER_TIME
(
transpose
(
guide
,
guideT
);
);
computeIDT
(
guide
,
guideT
);
if
(
NC_ocl_implememtation
==
ALG_INDEX
)
{
computeIndexMat
(
idistHor
,
distIndexHor
);
computeIndexMat
(
idistVert
,
distIndexVert
);
idistHor
.
release
();
idistVert
.
release
();
}
}
else
{
UMat
dsitHorWrapper
,
distVertWrapper
;
computeDT
(
guide
,
dsitHorWrapper
,
distVertWrapper
);
distHor
=
dsitHorWrapper
(
Range
::
all
(),
Range
(
1
,
w
+
1
));
MEASURE_KER_TIME
(
transpose
(
distVertWrapper
,
distVertWrapper
);
);
distVert
=
distVertWrapper
(
Range
::
all
(),
Range
(
1
,
h
+
1
));
if
(
NC_ocl_implememtation
==
ALG_INDEX
)
{
computeIndexMat
(
distHor
,
distIndexHor
,
false
);
computeIndexMat
(
distVert
,
distIndexVert
,
false
);
distHor
.
release
();
distVert
.
release
();
}
}
}
void
DTFilterOCL
::
initDT_RF
(
UMat
&
guide
)
{
float
sigmaRatio
=
(
float
)(
sigmaSpatial
/
sigmaColor
);
float
alpha1
=
getIterAlpha
(
1
);
ocl
::
Kernel
kerHor
(
"compute_a0DT_vert"
,
kerProgSrcDT
,
buildOptionsDT
);
ocl
::
Kernel
kerVert
(
"compute_a0DT_vert"
,
kerProgSrcDT
,
buildOptionsDT
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
UMat
guideT
;
transpose
(
guide
,
guideT
);
a0distHor
.
create
(
guide
.
cols
-
1
,
guide
.
rows
,
CV_32FC1
);
a0distVert
.
create
(
guide
.
rows
-
1
,
guide
.
cols
,
CV_32FC1
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guideT
),
ocl
::
KernelArg
::
ReadWriteNoSize
(
a0distHor
),
sigmaRatio
,
alpha1
);
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guide
),
ocl
::
KernelArg
::
ReadWriteNoSize
(
a0distVert
),
sigmaRatio
,
alpha1
);
size_t
globalSizeHor
[]
=
{
guide
.
cols
-
1
,
guide
.
rows
};
size_t
globalSizeVert
[]
=
{
guide
.
rows
-
1
,
guide
.
cols
};
size_t
localSizeHor
[]
=
{
1
,
getMaxWorkGropSize
()
};
size_t
localSizeVert
[]
=
{
1
,
getMaxWorkGropSize
()
};
bool
sync
=
true
;
MEASURE_KER_TIME
(
kerHor
.
run
(
2
,
globalSizeHor
,
localSizeHor
,
sync
));
MEASURE_KER_TIME
(
kerVert
.
run
(
2
,
globalSizeVert
,
localSizeVert
,
sync
));
}
void
DTFilterOCL
::
initDT_IC
(
UMat
&
guide
)
{
UMat
distHorWrapper
,
distVertWrapper
;
computeDT
(
guide
,
distHorWrapper
,
distVertWrapper
);
distHor
=
distHorWrapper
(
Range
::
all
(),
Range
(
1
,
w
+
1
));
distVert
=
distVertWrapper
(
Range
(
1
,
1
+
h
),
Range
::
all
());
UMat
distHorWrapperT
;
MEASURE_KER_TIME
(
transpose
(
distHorWrapper
,
distHorWrapperT
);
);
distHorT
=
distHorWrapperT
(
Range
(
1
,
w
+
1
),
Range
::
all
());
UMat
distVertWrapperT
;
MEASURE_KER_TIME
(
transpose
(
distVertWrapper
,
distVertWrapperT
);
);
distVertT
=
distVertWrapperT
(
Range
::
all
(),
Range
(
1
,
h
+
1
));
computeIndexAndTailMat
(
distHor
,
distIndexHor
,
tailHor
);
computeIndexAndTailMat
(
distVertT
,
distIndexVert
,
tailVert
);
}
void
DTFilterOCL
::
filter
(
InputArray
src_
,
OutputArray
dst_
,
int
dDepth
)
{
CV_Assert
(
src_
.
channels
()
<=
4
&&
(
src_
.
depth
()
==
CV_8U
||
src_
.
depth
()
==
CV_32F
)
);
CV_Assert
(
src_
.
cols
()
==
w
&&
src_
.
rows
()
==
h
);
setBuildOptionsFlt
(
src_
);
if
(
dDepth
==
-
1
)
dDepth
=
src_
.
depth
();
if
(
mode
==
DTF_NC
)
{
UMat
src
=
UMatAligned
(
h
,
w
,
workType
);
UMat
dst
;
MEASURE_KER_TIME
(
src_
.
getUMat
().
convertTo
(
src
,
workType
)
);
if
(
NC_ocl_implememtation
==
ALG_PER_ROW
)
{
filterNC_perRowAlg
(
src
,
dst
);
}
else
if
(
NC_ocl_implememtation
==
ALG_PER_PIXEL
)
{
filterNC_PerPixelAlg
(
src
,
dst
);
}
else
if
(
NC_ocl_implememtation
==
ALG_INDEX
)
{
filterNC_IndexAlg
(
src
,
dst
);
}
dst
.
convertTo
(
dst_
,
dDepth
);
}
else
if
(
mode
==
DTF_RF
)
{
filterRF
(
src_
,
dst_
,
dDepth
);
}
else
if
(
mode
==
DTF_IC
)
{
filterIC
(
src_
,
dst_
,
dDepth
);
}
else
{
CV_Error
(
Error
::
StsBadFlag
,
"Incorrect DT filter mode"
);
}
}
void
DTFilterOCL
::
initProgram
()
{
//CV_Assert(ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU);
kerProgSrcDT
=
ocl
::
ximgproc
::
dtfilter_dt_oclsrc
;
kerProgSrcFilter
=
ocl
::
ximgproc
::
dtfilter_flt_oclsrc
;
}
DTFilterOCL
::
DTFilterOCL
()
:
a0distHor
(
distHor
),
a0distVert
(
distVert
)
{
}
DTFilterOCL
::~
DTFilterOCL
()
{
totalKernelsTime
+=
kernelsTime
;
//std::cout << "Kernels time " << kernelsTime / getTickFrequency() << " sec.\n";
//std::cout << "Total kernels time " << totalKernelsTime / getTickFrequency() << " sec.\n";
}
void
DTFilterOCL
::
integrateCols
(
UMat
&
src
,
UMat
&
isrc
)
{
CV_Assert
(
src
.
depth
()
==
CV_32F
);
int
sizeof_float4
=
16
;
int
reqChuncks4f
=
(
src
.
cols
*
src
.
elemSize
()
+
sizeof_float4
-
1
)
/
sizeof_float4
;
int
maxChuncks4f
=
src
.
step
.
p
[
0
]
/
sizeof_float4
;
CV_Assert
(
reqChuncks4f
<=
maxChuncks4f
);
createAligned
(
isrc
,
src
.
rows
+
1
,
src
.
cols
,
src
.
type
(),
sizeof_float4
,
USAGE_ALLOCATE_DEVICE_MEMORY
);
ocl
::
Kernel
ker
=
ocl
::
Kernel
(
"integrate_cols_4f"
,
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
ker
.
empty
());
ker
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
src
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
isrc
),
(
int
)
src
.
rows
,
reqChuncks4f
);
size_t
globKerSize4f
[]
=
{
reqChuncks4f
};
bool
oclKerStatus
;
MEASURE_KER_TIME
(
oclKerStatus
=
ker
.
run
(
1
,
globKerSize4f
,
NULL
,
true
);
);
CV_Assert
(
oclKerStatus
);
}
void
DTFilterOCL
::
computeIndexMat
(
UMat
&
_dist
,
UMat
&
distIndex
,
bool
useIDT
)
{
distIndex
.
create
(
_dist
.
rows
*
numIters
,
2
*
_dist
.
cols
,
CV_32SC1
);
const
char
*
kernelName
=
(
useIDT
)
?
"find_conv_bounds_by_idt"
:
"find_conv_bounds_by_dt"
;
ocl
::
Kernel
ker
=
ocl
::
Kernel
(
kernelName
,
kerProgSrcDT
,
buildOptionsDT
);
ker
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
_dist
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
distIndex
),
getIterRadius
(
1
),
getIterRadius
(
2
),
getIterRadius
(
3
)
);
size_t
globSize
[]
=
{
_dist
.
rows
,
_dist
.
cols
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()
};
bool
oclKernelStatus
;
MEASURE_KER_TIME
(
oclKernelStatus
=
ker
.
run
(
2
,
globSize
,
localSize
,
true
));
CV_Assert
(
oclKernelStatus
);
}
void
DTFilterOCL
::
computeIndexAndTailMat
(
UMat
&
dist
,
UMat
&
distIndex
,
UMat
&
tail
)
{
distIndex
.
create
(
dist
.
rows
*
numIters
,
2
*
dist
.
cols
,
CV_32SC1
);
tail
.
create
(
dist
.
rows
*
numIters
,
2
*
dist
.
cols
,
CV_32FC1
);
ocl
::
Kernel
ker
=
ocl
::
Kernel
(
"find_conv_bounds_and_tails"
,
kerProgSrcDT
,
buildOptionsDT
);
CV_Assert
(
!
ker
.
empty
());
ker
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
dist
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
distIndex
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
tail
),
getIterRadius
(
1
),
getIterRadius
(
2
),
getIterRadius
(
3
)
);
size_t
globSize
[]
=
{
dist
.
rows
,
dist
.
cols
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()};
bool
oclKerStatus
;
MEASURE_KER_TIME
(
oclKerStatus
=
ker
.
run
(
2
,
globSize
,
localSize
,
true
);
);
CV_Assert
(
oclKerStatus
);
}
int
DTFilterOCL
::
getMaxWorkGropSize
()
{
int
maxWorkGropSize
=
ocl
::
Device
::
getDefault
().
maxWorkGroupSize
();
return
(
maxWorkGropSize
<=
0
)
?
256
:
maxWorkGropSize
;
}
int
DTFilterOCL
::
getCacheLineSize
()
{
int
lineSize
=
ocl
::
Device
::
getDefault
().
globalMemCacheLineSize
();
return
(
lineSize
<=
0
)
?
128
:
lineSize
;
}
void
DTFilterOCL
::
filterRF
(
InputArray
src_
,
OutputArray
dst_
,
int
dDepth
)
{
UMat
res
=
UMatAligned
(
h
,
w
,
workType
,
getCacheLineSize
());
UMat
resT
=
UMatAligned
(
w
,
h
,
workType
,
getCacheLineSize
());
UMat
adistHor
,
adistVert
;
MEASURE_KER_TIME
(
src_
.
getUMat
().
convertTo
(
res
,
workType
));
if
(
singleFilterCall
||
numIters
==
1
)
{
adistHor
=
a0distHor
;
adistVert
=
a0distVert
;
}
else
{
a0distHor
.
copyTo
(
adistHor
);
a0distVert
.
copyTo
(
adistVert
);
}
if
(
true
)
{
filterRF_blockAlg
(
res
,
resT
,
adistHor
,
adistVert
);
}
else
{
filterRF_naiveAlg
(
res
,
resT
,
adistHor
,
adistVert
);
}
res
.
convertTo
(
dst_
,
dDepth
);
}
void
DTFilterOCL
::
filterRF_blockAlg
(
UMat
&
res
,
UMat
&
resT
,
UMat
&
adistHor
,
UMat
&
adistVert
)
{
resT
.
create
(
res
.
cols
,
res
.
rows
,
res
.
type
());
UMat
weights
=
UMatAligned
(
res
.
rows
,
res
.
cols
,
CV_32FC1
,
getCacheLineSize
(),
USAGE_ALLOCATE_DEVICE_MEMORY
);
UMat
weightsT
=
UMatAligned
(
res
.
cols
,
res
.
rows
,
CV_32FC1
,
getCacheLineSize
(),
USAGE_ALLOCATE_DEVICE_MEMORY
);
for
(
int
iter
=
1
;
iter
<=
numIters
;
iter
++
)
{
transpose
(
res
,
resT
);
filterRF_iter_vert_pass
(
resT
,
adistHor
,
weightsT
);
transpose
(
resT
,
res
);
filterRF_iter_vert_pass
(
res
,
adistVert
,
weights
);
if
(
iter
<
numIters
)
{
cv
::
pow
(
adistHor
,
2
,
adistHor
);
cv
::
pow
(
adistVert
,
2
,
adistVert
);
}
}
}
void
DTFilterOCL
::
filterRF_iter_vert_pass
(
UMat
&
res
,
UMat
&
adist
,
UMat
&
weights
)
{
int
blockSize
=
(
int
)
std
::
sqrt
((
float
)
res
.
rows
);
int
blocksCount
=
(
res
.
rows
+
blockSize
-
1
)
/
blockSize
;
bool
sync
=
true
;
for
(
int
passId
=
0
;
passId
<=
1
;
passId
++
)
{
{
char
*
knames
[]
=
{
"filter_RF_block_init_fwd"
,
"filter_RF_block_init_bwd"
};
ocl
::
Kernel
kerInit
(
knames
[
passId
],
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerInit
.
empty
());
kerInit
.
args
(
ocl
::
KernelArg
::
ReadWrite
(
res
),
ocl
::
KernelArg
::
ReadWriteNoSize
(
adist
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
weights
),
blockSize
);
size_t
globSize
[]
=
{
blocksCount
,
res
.
cols
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()
};
CV_Assert
(
kerInit
.
run
(
2
,
globSize
,
localSize
,
sync
));
//imwrite(String(knames[passId]) + ".png", res.t());
}
{
const
char
*
knames
[]
=
{
"filter_RF_block_fill_borders_fwd"
,
"filter_RF_block_fill_borders_bwd"
};
ocl
::
Kernel
kerFillBorders
(
knames
[
passId
],
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerFillBorders
.
empty
());
kerFillBorders
.
args
(
ocl
::
KernelArg
::
ReadWrite
(
res
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
weights
),
blockSize
);
size_t
globSize
[]
=
{
res
.
cols
};
CV_Assert
(
kerFillBorders
.
run
(
1
,
globSize
,
NULL
,
sync
));
//imwrite(String(knames[passId]) + ".png", res.t());
}
{
const
char
*
knames
[]
=
{
"filter_RF_block_fill_fwd"
,
"filter_RF_block_fill_bwd"
};
ocl
::
Kernel
kerFill
(
knames
[
passId
],
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerFill
.
empty
());
kerFill
.
args
(
ocl
::
KernelArg
::
ReadWrite
(
res
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
weights
),
blockSize
);
size_t
globSize
[]
=
{
res
.
rows
-
blockSize
,
res
.
cols
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()
};
CV_Assert
(
kerFill
.
run
(
2
,
globSize
,
localSize
,
sync
));
//imwrite(String(knames[passId]) + ".png", res.t());
}
}
}
void
DTFilterOCL
::
filterRF_naiveAlg
(
UMat
&
res
,
UMat
&
resT
,
UMat
&
adistHor
,
UMat
&
adistVert
)
{
ocl
::
Kernel
kerHor
(
"filter_RF_vert"
,
kerProgSrcFilter
,
buildOptionsFlt
);
ocl
::
Kernel
kerVert
(
"filter_RF_vert"
,
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
size_t
globSizeHor
[]
=
{
h
};
size_t
globSizeVert
[]
=
{
w
};
bool
syncQueue
=
true
;
for
(
int
iter
=
1
;
iter
<=
numIters
;
iter
++
)
{
bool
statusHorPass
,
statusVertPass
;
int
write_a0
=
(
iter
<
numIters
);
MEASURE_KER_TIME
(
transpose
(
res
,
resT
));
kerHor
.
args
(
ocl
::
KernelArg
::
ReadWrite
(
resT
),
ocl
::
KernelArg
::
ReadWriteNoSize
(
adistHor
),
write_a0
);
MEASURE_KER_TIME
(
statusHorPass
=
kerHor
.
run
(
1
,
globSizeHor
,
NULL
,
syncQueue
));
MEASURE_KER_TIME
(
transpose
(
resT
,
res
));
kerVert
.
args
(
ocl
::
KernelArg
::
ReadWrite
(
res
),
ocl
::
KernelArg
::
ReadWriteNoSize
(
adistVert
),
write_a0
);
MEASURE_KER_TIME
(
statusVertPass
=
kerVert
.
run
(
1
,
globSizeVert
,
NULL
,
syncQueue
));
CV_Assert
(
statusHorPass
&&
statusVertPass
);
}
}
void
DTFilterOCL
::
filterNC_IndexAlg
(
UMat
&
src
,
UMat
&
dst
)
{
CV_Assert
(
src
.
rows
==
h
&&
src
.
cols
==
w
);
UMat
srcT
=
UMatAligned
(
w
,
h
,
workType
);
UMat
isrc
,
isrcT
;
ocl
::
Kernel
kerFiltHor
=
ocl
::
Kernel
(
"filter_NC_hor_by_bounds"
,
kerProgSrcFilter
,
buildOptionsFlt
);
ocl
::
Kernel
kerFiltVert
=
ocl
::
Kernel
(
"filter_NC_hor_by_bounds"
,
kerProgSrcFilter
,
buildOptionsFlt
);
size_t
globSizeHor
[]
=
{
h
,
w
};
size_t
globSizeVert
[]
=
{
w
,
h
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()};
bool
sync
=
true
;
MEASURE_KER_TIME
(
transpose
(
src
,
srcT
)
);
for
(
int
iter
=
0
;
iter
<
numIters
;
iter
++
)
{
bool
kerStatus
=
true
;
MEASURE_KER_TIME
(
integrateCols
(
srcT
,
isrcT
)
);
MEASURE_KER_TIME
(
transpose
(
isrcT
,
isrc
)
);
kerFiltHor
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
isrc
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distIndexHor
),
ocl
::
KernelArg
::
WriteOnly
(
src
),
iter
);
MEASURE_KER_TIME
(
kerStatus
&=
kerFiltHor
.
run
(
2
,
globSizeHor
,
localSize
,
sync
)
);
MEASURE_KER_TIME
(
integrateCols
(
src
,
isrc
)
);
MEASURE_KER_TIME
(
transpose
(
isrc
,
isrcT
)
);
kerFiltVert
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
isrcT
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distIndexVert
),
ocl
::
KernelArg
::
WriteOnly
(
srcT
),
iter
);
MEASURE_KER_TIME
(
kerStatus
&=
kerFiltVert
.
run
(
2
,
globSizeVert
,
localSize
,
sync
)
);
CV_Assert
(
kerStatus
);
}
MEASURE_KER_TIME
(
transpose
(
srcT
,
dst
)
);
}
void
DTFilterOCL
::
filterNC_perRowAlg
(
UMat
&
src
,
UMat
&
dst
)
{
dst
.
create
(
h
,
w
,
workType
);
UMat
&
res
=
dst
;
UMat
srcT
=
UMat
(
w
,
h
,
workType
);
UMat
resT
=
UMat
(
w
,
h
,
workType
);
MEASURE_KER_TIME
(
transpose
(
src
,
srcT
));
ocl
::
Kernel
kerHor
(
"filter_NC_vert"
,
kerProgSrcFilter
,
buildOptionsFlt
);
ocl
::
Kernel
kerVert
(
"filter_NC_vert"
,
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
size_t
globSizeHor
[]
=
{
h
};
size_t
globSizeVert
[]
=
{
w
};
bool
syncQueue
=
true
;
for
(
int
iter
=
1
;
iter
<=
numIters
;
iter
++
)
{
bool
statusHorPass
,
statusVertPass
;
float
radius
=
getIterRadius
(
iter
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
srcT
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
idistHor
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
resT
),
radius
);
MEASURE_KER_TIME
(
statusHorPass
=
kerHor
.
run
(
1
,
globSizeHor
,
NULL
,
syncQueue
));
MEASURE_KER_TIME
(
transpose
(
resT
,
src
));
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
idistVert
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
res
),
radius
);
MEASURE_KER_TIME
(
statusVertPass
=
kerVert
.
run
(
1
,
globSizeVert
,
NULL
,
syncQueue
));
if
(
iter
<
numIters
)
MEASURE_KER_TIME
(
transpose
(
res
,
srcT
));
CV_Assert
(
statusHorPass
&&
statusVertPass
);
}
}
void
DTFilterOCL
::
filterNC_PerPixelAlg
(
UMat
&
src
,
UMat
&
dst
)
{
dst
=
src
;
UMat
res
(
h
,
w
,
workType
);
UMat
srcT
(
w
,
h
,
workType
),
resT
(
w
,
h
,
workType
);
ocl
::
Kernel
kerHor
(
"filter_NC_hor"
,
kerProgSrcFilter
);
ocl
::
Kernel
kerVert
(
"filter_NC_hor"
,
kerProgSrcFilter
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
size_t
globSizeHor
[]
=
{
h
,
w
};
size_t
globSizeVert
[]
=
{
w
,
h
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()
};
bool
sync
=
true
;
for
(
int
iter
=
1
;
iter
<=
numIters
;
iter
++
)
{
bool
kerStatus
=
true
;
float
radius
=
getIterRadius
(
iter
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
idistHor
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
res
),
radius
);
MEASURE_KER_TIME
(
kerStatus
&=
kerHor
.
run
(
2
,
globSizeHor
,
localSize
,
sync
));
MEASURE_KER_TIME
(
transpose
(
res
,
srcT
));
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
srcT
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
idistVert
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
resT
),
radius
);
MEASURE_KER_TIME
(
kerStatus
&=
kerVert
.
run
(
2
,
globSizeVert
,
localSize
,
sync
));
MEASURE_KER_TIME
(
transpose
(
resT
,
src
));
CV_Assert
(
kerStatus
);
}
}
void
DTFilterOCL
::
filterIC
(
InputArray
src_
,
OutputArray
dst_
,
int
dDepth
)
{
UMat
srcWrapper
(
h
+
1
,
w
+
2
,
workType
);
UMat
srcTWrapper
(
w
+
1
,
h
+
2
,
workType
);
UMat
src
=
srcWrapper
(
Range
(
1
,
1
+
h
),
Range
(
1
,
1
+
w
));
UMat
srcT
=
srcTWrapper
(
Range
(
1
,
1
+
w
),
Range
(
1
,
1
+
h
));
UMat
isrc
=
UMatAligned
(
h
,
w
+
1
,
workType
);
UMat
isrcT
=
UMatAligned
(
w
+
1
,
h
,
workType
);
UMat
res
=
UMatAligned
(
h
,
w
,
workType
);
UMat
resT
=
UMatAligned
(
w
,
h
,
workType
);
MEASURE_KER_TIME
(
src_
.
getUMat
().
convertTo
(
src
,
workType
)
);
ocl
::
Kernel
kerHor
=
ocl
::
Kernel
(
"filter_IC_hor"
,
kerProgSrcFilter
,
buildOptionsFlt
);
ocl
::
Kernel
kerVert
=
ocl
::
Kernel
(
"filter_IC_hor"
,
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
size_t
globSizeHor
[]
=
{
h
,
w
};
size_t
globSizeVert
[]
=
{
w
,
h
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()};
bool
sync
=
true
;
MEASURE_KER_TIME
(
transpose
(
src
,
resT
)
);
for
(
int
iter
=
0
;
iter
<
numIters
;
iter
++
)
{
bool
oclKerStatus
=
true
;
float
curIterRadius
=
getIterRadius
(
iter
);
integrateColsIC
(
resT
,
distHorT
,
isrcT
);
MEASURE_KER_TIME
(
transpose
(
isrcT
,
isrc
);
);
if
(
iter
!=
0
)
MEASURE_KER_TIME
(
transpose
(
resT
,
src
);
);
duplicateVerticalBorders
(
srcWrapper
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
isrc
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distIndexHor
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
tailHor
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distHor
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
res
),
curIterRadius
,
iter
);
MEASURE_KER_TIME
(
oclKerStatus
&=
kerHor
.
run
(
2
,
globSizeHor
,
localSize
,
sync
);
);
integrateColsIC
(
res
,
distVert
,
isrc
);
MEASURE_KER_TIME
(
transpose
(
isrc
,
isrcT
);
);
MEASURE_KER_TIME
(
transpose
(
res
,
srcT
);
);
duplicateVerticalBorders
(
srcTWrapper
);
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
srcT
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
isrcT
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distIndexVert
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
tailVert
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
distVertT
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
resT
),
curIterRadius
,
iter
);
MEASURE_KER_TIME
(
oclKerStatus
&=
kerVert
.
run
(
2
,
globSizeVert
,
localSize
,
sync
);
);
CV_Assert
(
oclKerStatus
);
}
if
(
dDepth
==
resT
.
depth
())
{
transpose
(
resT
,
dst_
);
}
else
{
transpose
(
resT
,
res
);
res
.
convertTo
(
dst_
,
dDepth
);
}
}
void
DTFilterOCL
::
setBuildOptionsFlt
(
InputArray
src_
)
{
int
cn
=
src_
.
channels
();
//int depth = src_.depth();
workType
=
CV_MAKE_TYPE
(
CV_32F
,
cn
);
buildOptionsFlt
=
format
(
"-D cn=%d "
"-D SrcVec=%s "
"-D NUM_ITERS=%d "
,
cn
,
ocl
::
typeToStr
(
workType
),
numIters
);
//std::cout << "build options flt.: " << buildOptionsFlt.c_str() << "\n";
}
void
DTFilterOCL
::
setBuildOptionsDT
(
UMat
&
guide
)
{
int
gType
=
guide
.
type
();
int
gcn
=
guide
.
channels
();
int
gDepth
=
guide
.
depth
();
char
strBuf
[
40
];
buildOptionsDT
=
format
(
"-D NUM_ITERS=%d "
"-D cn=%d "
"-D GuideType=%s "
"-D GuideVec=%s "
,
numIters
,
gcn
,
ocl
::
typeToStr
(
gDepth
),
ocl
::
typeToStr
(
gType
)
);
if
(
gDepth
!=
CV_32F
)
{
buildOptionsDT
+=
format
(
"-D convert_guide=%s "
,
ocl
::
convertTypeStr
(
gDepth
,
CV_32F
,
gcn
,
strBuf
));
}
//cout << "buildOptions:" << buildOptionsDT.c_str() << "\n";
}
void
DTFilterOCL
::
computeDT
(
UMat
&
guide
,
UMat
&
distHorOuter
,
UMat
&
distVertOuter
)
{
ocl
::
Kernel
kerHor
=
ocl
::
Kernel
(
"compute_dt_hor"
,
kerProgSrcDT
,
buildOptionsDT
);
ocl
::
Kernel
kerVert
=
ocl
::
Kernel
(
"compute_dt_vert"
,
kerProgSrcDT
,
buildOptionsDT
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
distHorOuter
.
create
(
h
,
w
+
1
,
CV_32FC1
);
distVertOuter
.
create
(
h
+
1
,
w
,
CV_32FC1
);
UMat
distHor
=
distHorOuter
(
Range
::
all
(),
Range
(
1
,
w
+
1
));
UMat
distVert
=
distVertOuter
(
Range
(
1
,
h
+
1
),
Range
::
all
());
float
maxRadius
=
1.1
f
*
getIterRadius
(
1
);
float
sigmaRatio
=
(
float
)(
sigmaSpatial
/
sigmaColor
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guide
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
distHor
),
sigmaRatio
,
maxRadius
);
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guide
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
distVert
),
sigmaRatio
,
maxRadius
);
size_t
globSizeHor
[]
=
{
h
,
w
+
1
};
size_t
globSizeVert
[]
=
{
h
+
1
,
w
};
size_t
localSize
[]
=
{
1
,
getMaxWorkGropSize
()};
bool
sync
=
true
;
bool
oclKernelStatus
=
true
;
MEASURE_KER_TIME
(
oclKernelStatus
&=
kerHor
.
run
(
2
,
globSizeHor
,
localSize
,
sync
);
);
MEASURE_KER_TIME
(
oclKernelStatus
&=
kerVert
.
run
(
2
,
globSizeVert
,
localSize
,
sync
);
);
CV_Assert
(
oclKernelStatus
);
}
void
DTFilterOCL
::
computeIDT
(
UMat
&
guide
,
UMat
&
guideT
)
{
ocl
::
Kernel
kerHor
(
"compute_idt_vert"
,
kerProgSrcDT
,
buildOptionsDT
);
ocl
::
Kernel
kerVert
(
"compute_idt_vert"
,
kerProgSrcDT
,
buildOptionsDT
);
CV_Assert
(
!
kerHor
.
empty
()
&&
!
kerVert
.
empty
());
UMat
idistHorWrap
(
w
+
2
,
h
,
CV_32FC1
);
UMat
idistVertWrap
(
h
+
2
,
w
,
CV_32FC1
);
idistHor
=
idistHorWrap
(
Range
(
1
,
1
+
w
),
Range
::
all
());
idistVert
=
idistVertWrap
(
Range
(
1
,
1
+
h
),
Range
::
all
());
float
sigmaRatio
=
(
float
)(
sigmaSpatial
/
sigmaColor
);
kerHor
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guideT
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
idistHor
),
sigmaRatio
);
kerVert
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
guide
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
idistVert
),
sigmaRatio
);
size_t
globalSizeHor
[]
=
{
guide
.
rows
};
size_t
globalSizeVert
[]
=
{
guide
.
cols
};
bool
sync
=
true
;
MEASURE_KER_TIME
(
kerHor
.
run
(
1
,
globalSizeHor
,
NULL
,
sync
));
MEASURE_KER_TIME
(
kerVert
.
run
(
1
,
globalSizeVert
,
NULL
,
sync
));
if
(
NC_ocl_implememtation
!=
ALG_PER_ROW
)
{
MEASURE_KER_TIME
(
transpose
(
idistHorWrap
,
idistHorWrap
));
MEASURE_KER_TIME
(
transpose
(
idistVertWrap
,
idistVertWrap
));
idistHor
=
idistHorWrap
(
Range
::
all
(),
Range
(
1
,
1
+
w
));
idistVert
=
idistVertWrap
(
Range
::
all
(),
Range
(
1
,
1
+
h
));
}
else
{
idistHor
=
idistHorWrap
(
Range
(
1
,
1
+
w
),
Range
::
all
());
idistVert
=
idistVertWrap
(
Range
(
1
,
1
+
h
),
Range
::
all
());
}
}
void
DTFilterOCL
::
duplicateVerticalBorders
(
UMat
&
srcOuter
)
{
Range
rangeRows
(
0
,
srcOuter
.
rows
);
{
UMat
leftBorderSrc
=
srcOuter
(
rangeRows
,
Range
(
1
,
2
));
UMat
leftBorderDst
=
srcOuter
(
rangeRows
,
Range
(
0
,
1
));
leftBorderSrc
.
copyTo
(
leftBorderDst
);
}
{
UMat
rightBorderSrc
=
srcOuter
(
rangeRows
,
Range
(
srcOuter
.
cols
-
2
,
srcOuter
.
cols
-
1
));
UMat
rightBorderDst
=
srcOuter
(
rangeRows
,
Range
(
srcOuter
.
cols
-
1
,
srcOuter
.
cols
));
rightBorderSrc
.
copyTo
(
rightBorderDst
);
}
}
void
DTFilterOCL
::
integrateColsIC
(
UMat
&
src
,
UMat
&
dist
,
UMat
&
isrc
)
{
CV_Assert
(
src
.
size
()
==
dist
.
size
()
&&
src
.
depth
()
==
CV_32F
);
isrc
.
create
(
src
.
rows
+
1
,
src
.
cols
,
workType
);
ocl
::
Kernel
ker
(
"integrate_cols_with_dist"
,
kerProgSrcFilter
,
buildOptionsFlt
);
CV_Assert
(
!
ker
.
empty
());
ker
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
dist
),
ocl
::
KernelArg
::
WriteOnlyNoSize
(
isrc
)
);
size_t
globSize
[]
=
{
src
.
cols
};
size_t
localSize
[]
=
{
getMaxWorkGropSize
()};
bool
oclKernelStatus
;
MEASURE_KER_TIME
(
oclKernelStatus
=
ker
.
run
(
1
,
globSize
,
localSize
,
true
)
);
CV_Assert
(
oclKernelStatus
);
}
cv
::
UMat
DTFilterOCL
::
UMatAligned
(
int
rows
,
int
cols
,
int
type
,
int
align
,
int
usageFlags
)
{
int
depth
=
CV_MAT_DEPTH
(
type
);
int
cn
=
CV_MAT_CN
(
type
);
int
elemSize1
=
CV_ELEM_SIZE1
(
type
);
int
elemSize
=
CV_ELEM_SIZE
(
type
);
int
lineSize
=
((
cols
*
elemSize
+
align
-
1
)
/
align
)
*
align
;
CV_Assert
(
lineSize
%
elemSize1
==
0
);
UMat
dst
(
rows
,
lineSize
/
elemSize1
,
CV_MAKE_TYPE
(
depth
,
1
),
usageFlags
);
return
dst
(
Range
::
all
(),
Range
(
0
,
cols
*
cn
)).
reshape
(
cn
);
}
void
DTFilterOCL
::
createAligned
(
UMat
&
src
,
int
rows
,
int
cols
,
int
type
,
int
align
,
int
usageFlags
)
{
if
(
src
.
empty
()
||
src
.
rows
!=
rows
||
src
.
cols
!=
cols
||
src
.
type
()
!=
type
)
{
if
(
CV_ELEM_SIZE
(
type
)
*
cols
%
align
==
0
)
{
src
.
create
(
rows
,
cols
,
type
,
(
UMatUsageFlags
)
usageFlags
);
}
else
{
src
=
UMatAligned
(
rows
,
cols
,
type
,
align
,
usageFlags
);
}
}
}
void
DTFilterOCL
::
setSingleFilterCall
(
bool
flag
)
{
this
->
singleFilterCall
=
flag
;
}
}
\ No newline at end of file
modules/ximgproc/src/dtfilter_ocl.hpp
deleted
100644 → 0
View file @
291c87a1
#pragma once
#include "precomp.hpp"
namespace
cv
{
class
DTFilterOCL
:
public
DTFilter
{
public
:
static
Ptr
<
DTFilterOCL
>
create
(
InputArray
guide
,
double
sigmaSpatial
,
double
sigmaColor
,
int
mode
=
DTF_NC
,
int
numIters
=
3
);
void
filter
(
InputArray
src
,
OutputArray
dst
,
int
dDepth
=
-
1
);
void
setSingleFilterCall
(
bool
flag
);
~
DTFilterOCL
();
protected
:
/*Members declarations*/
UMat
idistHor
,
idistVert
;
UMat
distHor
,
distVert
;
UMat
distHorT
,
distVertT
;
UMat
&
a0distHor
,
&
a0distVert
;
//synonyms of distHor distVert
UMat
distIndexHor
,
distIndexVert
;
UMat
tailVert
,
tailHor
;
int
mode
,
numIters
;
float
sigmaSpatial
,
sigmaColor
;
bool
singleFilterCall
;
int
h
,
w
;
int
workType
;
double
meanDist
;
static
int64
totalKernelsTime
;
int64
kernelsTime
;
ocl
::
ProgramSource
kerProgSrcDT
;
ocl
::
ProgramSource
kerProgSrcFilter
;
cv
::
String
buildOptionsFlt
;
cv
::
String
buildOptionsDT
;
int
NC_ocl_implememtation
;
enum
NCOclImplementation
{
ALG_PER_ROW
,
ALG_PER_PIXEL
,
ALG_INDEX
,
};
protected
:
/*Functions declarations*/
DTFilterOCL
();
void
init
(
InputArray
guide
,
double
sigmaSpatial
,
double
sigmaColor
,
int
mode
=
DTF_NC
,
int
numIters
=
3
);
inline
double
getIterSigmaH
(
int
iterNum
)
{
return
sigmaSpatial
*
std
::
pow
(
2.0
,
numIters
-
iterNum
)
/
sqrt
(
std
::
pow
(
4.0
,
numIters
)
-
1
);
}
inline
float
getIterRadius
(
int
iterNum
)
{
return
(
float
)(
3.0
*
getIterSigmaH
(
iterNum
));
}
inline
float
getIterAlpha
(
int
iterNum
)
{
return
(
float
)
std
::
exp
(
-
std
::
sqrt
(
2.0
/
3.0
)
/
getIterSigmaH
(
iterNum
));
}
int
getMaxWorkGropSize
();
int
getCacheLineSize
();
void
initProgram
();
void
setBuildOptionsFlt
(
InputArray
src_
);
void
setBuildOptionsDT
(
UMat
&
guide
);
void
initDT_NC
(
UMat
&
guide
);
void
initDT_RF
(
UMat
&
guide
);
void
initDT_IC
(
UMat
&
guide
);
void
computeDT
(
UMat
&
guide
,
UMat
&
distHorOuter
,
UMat
&
distVertOuter
);
void
computeIDT
(
UMat
&
guide
,
UMat
&
guideT
);
void
filterNC_IndexAlg
(
UMat
&
src
,
UMat
&
dst
);
void
filterNC_perRowAlg
(
UMat
&
src
,
UMat
&
dst
);
void
filterNC_PerPixelAlg
(
UMat
&
src
,
UMat
&
dst
);
void
filterIC
(
InputArray
src_
,
OutputArray
dst_
,
int
dDepth
);
void
filterRF
(
InputArray
src_
,
OutputArray
dst_
,
int
dDepth
);
void
filterRF_naiveAlg
(
UMat
&
res
,
UMat
&
resT
,
UMat
&
adistHor
,
UMat
&
adistVert
);
void
filterRF_blockAlg
(
UMat
&
res
,
UMat
&
resT
,
UMat
&
adistHor
,
UMat
&
adistVert
);
void
filterRF_iter_vert_pass
(
UMat
&
res
,
UMat
&
adist
,
UMat
&
weights
);
void
computeIndexMat
(
UMat
&
idist
,
UMat
&
distIndex
,
bool
useIDT
=
true
);
void
integrateCols
(
UMat
&
src
,
UMat
&
isrc
);
void
computeIndexAndTailMat
(
UMat
&
dist
,
UMat
&
distIndex
,
UMat
&
tail
);
void
duplicateVerticalBorders
(
UMat
&
srcOuter
);
void
integrateColsIC
(
UMat
&
src
,
UMat
&
dist
,
UMat
&
isrc
);
static
UMat
UMatAligned
(
int
rows
,
int
cols
,
int
type
,
int
align
=
16
,
int
useageFlags
=
USAGE_DEFAULT
);
static
void
createAligned
(
UMat
&
src
,
int
rows
,
int
cols
,
int
type
,
int
align
=
16
,
int
usageFlags
=
USAGE_DEFAULT
);
};
}
\ No newline at end of file
modules/ximgproc/src/opencl/OpenCLKernel.hpp
deleted
100644 → 0
View file @
291c87a1
This source diff could not be displayed because it is too large. You can
view the blob
instead.
modules/ximgproc/src/opencl/dtfilter_dt.cl
deleted
100644 → 0
View file @
291c87a1
#
if
defined
(
_MSC_VER
)
//for
correct
syntax
highlighting
#
include
"OpenCLKernel.hpp"
#
define
cn
3
#
define
GuideType
uchar
#
define
GuideVec
uchar3
#
define
convert_guide
convert_float3
#
define
NUM_ITERS
3
#
endif
#
if
!defined
(
convert_guide
)
#
define
convert_guide
(
a
)
(
a
)
#
endif
#
if
cn==3
#
define
ELEM_SIZE
(
3*sizeof
(
GuideType
))
#
define
loadGuide
(
addr
)
vload3
(
0
,
(
__global
const
GuideType
*
)(
addr
))
#
else
#
define
ELEM_SIZE
sizeof
(
GuideVec
)
#
define
loadGuide
(
addr
)
(
*
(
__global
const
GuideVec*
)(
addr
)
)
#
endif
#
define
storeDist
(
val,
addr
)
*
(
__global
float*
)(
addr
)
=
val
#
if
cn
==
1
#
define
SUM
(
a
)
(
a
)
#
elif
cn
==
2
#
define
SUM
(
a
)
(
a.x
+
a.y
)
#
elif
cn
==
3
#
define
SUM
(
a
)
(
a.x
+
a.y
+
a.z
)
#
elif
cn
==
4
#
define
SUM
(
a
)
(
a.x
+
a.y
+
a.z
+
a.w
)
#
else
#
error
"cn should be <= 4"
#
endif
#
define
NORM
(
a,
b
)
SUM
(
fabs
(
convert_guide
(
a
)
-
convert_guide
(
b
)))
#
define
DT
(
a,
b,
sigmaRatios
)
(
1.0f
+
(
sigmaRatios
)
*NORM
(
a,
b
))
#
define
getFloat
(
addr,
index
)
(
*
(
__global
const
float*
)(
addr
+
(
index
)
*sizeof
(
float
))
)
#
define
storeFloat
(
val,
addr,
index
)
(
*
(
__global
float*
)(
addr
+
(
index
)
*sizeof
(
float
))
=
val
)
#
define
PtrAdd
(
addr,
bytesNum,
Type
)
((
__global
Type
*
)((
__global
uchar*
)(
addr
)
+
bytesNum
))
#
define
PtrAddConst
(
addr,
bytesNum,
Type
)
((
__global
const
Type
*
)((
__global
const
uchar*
)(
addr
)
+
bytesNum
))
__kernel
void
find_conv_bounds_by_idt
(
__global
const
uchar
*idist,
int
idist_step,
int
idist_offset,
int
rows,
int
cols,
__global
uchar
*bounds,
int
bounds_step,
int
bounds_offset,
float
radius1,
float
radius2,
float
radius3
)
{
int
i
=
get_global_id
(
0
)
;
int
j
=
get_global_id
(
1
)
;
if
(
!
(
i
>=
0
&&
i
<
rows
)
|
| !(j >= 0 && j < cols))
return;
idist += mad24(i, idist_step, idist_offset);
__global int *bound = bounds + mad24((NUM_ITERS-1)*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
float center_idt_val = getFloat(idist, j);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float search_idt_val;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
search_idt_val = center_idt_val - radius[iter];
while (search_idt_val < getFloat(idist, low_bound-1))
low_bound--;
bound[0] = low_bound;
search_idt_val = center_idt_val + radius[iter];
while (getFloat(idist, high_bound + 1) < search_idt_val)
high_bound++;
bound[1] = high_bound;
bound = PtrAdd(bound, -rows*bounds_step, int);
}
}
__kernel void find_conv_bounds_by_dt(__global const uchar *dist, int dist_step, int dist_offset, int rows, int cols,
__global uchar *bounds, int bounds_step, int bounds_offset,
float radius1, float radius2, float radius3)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dist += mad24(i, dist_step, dist_offset);
int rowsOffset = mad24(NUM_ITERS - 1, rows, i);
__global int *bound = PtrAdd(bounds, mad24(rowsOffset, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset)), int);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float val, cur_radius, low_dt_val = 0.0f, high_dt_val = 0.0f;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
cur_radius = radius[iter];
while (cur_radius > (val = low_dt_val + getFloat(dist, low_bound - 1)) )
{
low_dt_val = val;
low_bound--;
}
bound[0] = low_bound;
while (cur_radius > (val = high_dt_val + getFloat(dist, high_bound)) )
{
high_dt_val = val;
high_bound++;
}
bound[1] = high_bound;
bound = PtrAdd(bound, -rows*bounds_step, int);
}
}
__kernel void find_conv_bounds_and_tails(__global const uchar *dist, int dist_step, int dist_offset, int rows, int cols,
__global uchar *bounds, int bounds_step, int bounds_offset,
__global float *tailmat, int tail_step, int tail_offset,
float radius1, float radius2, float radius3)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dist += mad24(i, dist_step, dist_offset);
int rowsOffset = mad24(NUM_ITERS - 1, rows, i);
__global int *bound = PtrAdd(bounds, mad24(rowsOffset, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset)), int);
__global float *tail = PtrAdd(tailmat, mad24(rowsOffset, tail_step, mad24(j, 2*(int)sizeof(float), tail_offset)), float);
float radius[] = {radius1, radius2, radius3};
int low_bound = j;
int high_bound = j;
float val, cur_radius, low_dt_val = 0.0f, high_dt_val = 0.0f;
for (int iter = NUM_ITERS - 1; iter >= 0; iter--)
{
cur_radius = radius[iter];
while (cur_radius > (val = low_dt_val + getFloat(dist, low_bound - 1)) )
{
low_dt_val = val;
low_bound--;
}
bound[0] = low_bound;
tail[0] = (cur_radius - low_dt_val);
while (cur_radius > (val = high_dt_val + getFloat(dist, high_bound)) )
{
high_dt_val = val;
high_bound++;
}
bound[1] = high_bound;
tail[1] = (cur_radius - high_dt_val);
bound = PtrAdd(bound, -rows*bounds_step, int);
tail = PtrAdd(tail, -rows*tail_step, float);
}
}
__kernel void compute_dt_hor(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio, float max_radius)
{
int i = get_global_id(0);
int j = get_global_id(1) - 1;
if (!(i >= 0 && i < src_rows && j >= -1 && j < src_cols))
return;
src += mad24(i, src_step, mad24(j, (int)ELEM_SIZE, src_offset));
dst += mad24(i, dst_step, mad24(j, (int)sizeof(float) , dst_offset));
float dist;
if (j == -1 || j == src_cols - 1)
dist = max_radius;
else
dist = DT(loadGuide(src), loadGuide(src + ELEM_SIZE), sigma_ratio);
storeDist(dist, dst);
}
__kernel void compute_dt_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar *dst, int dst_step, int dst_offset,
float sigma_ratio, float max_radius)
{
int i = get_global_id(0) - 1;
int j = get_global_id(1);
if (!(i >= -1 && i < src_rows && j >= 0 && j < src_cols))
return;
src += mad24(i, src_step, mad24(j, (int)ELEM_SIZE, src_offset));
dst += mad24(i, dst_step, mad24(j, (int)sizeof(float) , dst_offset));
float dist;
if (i == -1 |
|
i
==
src_rows
-
1
)
dist
=
max_radius
;
else
dist
=
DT
(
loadGuide
(
src
)
,
loadGuide
(
src
+
src_step
)
,
sigma_ratio
)
;
storeDist
(
dist,
dst
)
;
}
__kernel
void
compute_idt_vert
(
__global
const
uchar
*src,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
float
sigma_ratio
)
{
int
j
=
get_global_id
(
0
)
;
if
(
!
(
j
>=
0
&&
j
<
src_cols
))
return
;
int
i
;
float
idist
=
0
;
__global
const
uchar
*src_col
=
src
+
mad24
(
j,
(
int
)
ELEM_SIZE,
src_offset
)
;
__global
uchar
*dst_col
=
dst
+
mad24
(
j,
(
int
)
sizeof
(
float
)
,
dst_offset
)
;
storeFloat
(
-FLT_MAX,
dst_col
+
(
-1
)
*dst_step,
0
)
;
storeFloat
(
0.0f,
dst_col
+
0*dst_step,
0
)
;
for
(
i
=
1
; i < src_rows; i++, src_col += src_step)
{
idist
+=
DT
(
loadGuide
(
src_col
)
,
loadGuide
(
src_col
+
src_step
)
,
sigma_ratio
)
;
storeFloat
(
idist,
dst_col
+
i*dst_step,
0
)
;
}
storeFloat
(
FLT_MAX,
dst_col
+
i*dst_step,
0
)
;
}
__kernel
void
compute_a0DT_vert
(
__global
const
uchar
*src,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
float
sigma_ratio,
float
alpha
)
{
int
i
=
get_global_id
(
0
)
;
int
j
=
get_global_id
(
1
)
;
if
(
!
(
i
>=
0
&&
i
<
src_rows-1
&&
j
>=
0
&&
j
<
src_cols
))
return
;
src
+=
mad24
(
i,
src_step,
mad24
(
j,
(
int
)
ELEM_SIZE,
src_offset
))
;
dst
+=
mad24
(
i,
dst_step,
mad24
(
j,
(
int
)
sizeof
(
float
)
,
dst_offset
))
;
float
dist
=
DT
(
loadGuide
(
src
)
,
loadGuide
(
src
+
src_step
)
,
sigma_ratio
)
;
storeDist
(
native_powr
(
alpha,
dist
)
,
dst
)
;
}
\ No newline at end of file
modules/ximgproc/src/opencl/dtfilter_flt.cl
deleted
100644 → 0
View file @
291c87a1
#
if
defined
(
_MSC_VER
)
//for
correct
syntax
highlighting
#
include
"OpenCLKernel.hpp"
#
define
cn
3
#
define
SrcVec
float3
#
define
NUM_ITERS
3
#
endif
#
define
getFloatn
(
n,
addr,
index
)
vload##n
(
index,
(
__global
const
float*
)(
addr
))
#
define
storeFloatn
(
n,
val,
addr,
index
)
vstore##n
(
val,
index,
(
__global
float*
)(
addr
))
#
define
PtrAdd
(
addr,
bytesNum,
Type
)
((
__global
Type
*
)((
__global
uchar*
)(
addr
)
+
bytesNum
))
#
define
PtrAddConst
(
addr,
bytesNum,
Type
)
((
__global
const
Type
*
)((
__global
const
uchar*
)(
addr
)
+
bytesNum
))
#
if
cn==3
#
define
ELEM_SIZE
(
int
)(
3*sizeof
(
float
))
#
define
getPix
(
addr,
index
)
vload3
(
index,
(
__global
const
float*
)(
addr
))
#
define
storePix
(
val,
addr,
index
)
vstore3
(
val,
index,
(
__global
float*
)(
addr
))
#
else
#
define
ELEM_SIZE
(
int
)
sizeof
(
SrcVec
)
#
define
getPix
(
addr,
index
)
(
*
(
__global
const
SrcVec*
)(
addr
+
(
index
)
*ELEM_SIZE
)
)
#
define
storePix
(
val,
addr,
index
)
(
*
(
__global
SrcVec*
)(
addr
+
(
index
)
*ELEM_SIZE
)
=
val
)
#
endif
#
define
getFloat
(
addr,
index
)
(
*
(
__global
const
float*
)(
addr
+
(
index
)
*sizeof
(
float
))
)
#
define
storeFloat
(
val,
addr,
index
)
(
*
(
__global
float*
)(
addr
+
(
index
)
*sizeof
(
float
))
=
val
)
#
define
getInt
(
addr,
index
)
(
*
(
__global
const
int*
)(
addr
+
(
index
)
*sizeof
(
int
))
)
#
define
storeInt
(
val,
addr,
index
)
(
*
(
__global
int*
)(
addr
+
(
index
)
*sizeof
(
int
))
=
val
)
#
define
getFloat4
(
addr,
index
)
getFloatn
(
4
,
addr,
index
)
#
define
storeFloat4
(
val,
addr,
index
)
storeFloatn
(
4
,
val,
addr,
index
)
#
define
NC_USE_INTEGRAL_SRC
#
undef
NC_USE_INTEGRAL_SRC
__kernel
void
integrate_cols_4f
(
__global
uchar
*src,
int
src_step,
int
src_offset,
__global
uchar
*isrc,
int
isrc_step,
int
isrc_offset,
int
rows,
int
col_chunks
)
{
int
j
=
get_global_id
(
0
)
;
if
(
!
(
j
>=
0
&&
j
<
col_chunks
)
)
return
;
src
+=
mad24
(
j,
(
int
)
sizeof
(
float4
)
,
src_offset
)
;
isrc
+=
mad24
(
j,
(
int
)
sizeof
(
float4
)
,
isrc_offset
)
;
float4
sum
=
0
;
storeFloat4
(
0
,
isrc,
0
)
;
isrc
+=
isrc_step
;
for
(
int
i
=
0
; i < rows; i++, src += src_step, isrc += isrc_step)
{
sum
+=
getFloat4
(
src,
0
)
;
storeFloat4
(
sum,
isrc,
0
)
;
}
}
__kernel
void
integrate_cols_with_dist
(
__global
const
uchar
*src,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
const
uchar
*dist,
int
dist_step,
int
dist_offset,
__global
uchar
*isrc,
int
isrc_step,
int
isrc_offset
)
{
int
j
=
get_global_id
(
0
)
;
if
(
!
(
j
>=
0
&&
j
<
src_cols
)
)
return
;
src
+=
mad24
(
j,
ELEM_SIZE,
src_offset
)
;
isrc
+=
mad24
(
j,
ELEM_SIZE,
isrc_offset
)
;
dist
+=
mad24
(
j,
(
int
)
sizeof
(
float
)
,
dist_offset
)
;
SrcVec
sum
=
0
;
storePix
(
0
,
isrc,
0
)
;
isrc
+=
isrc_step
;
for
(
int
i
=
0
; i < src_rows; i++, src += src_step, isrc += isrc_step, dist += dist_step)
{
sum
+=
0.5f
*
(
getPix
(
src,
0
)
+
getPix
(
src
+
src_step,
0
))
*
getFloat
(
dist,
0
)
;
storePix
(
sum,
isrc,
0
)
;
}
}
__kernel
void
filter_IC_hor
(
__global
const
uchar
*src,
int
src_step,
int
src_offset,
int
rows,
int
cols,
__global
const
uchar
*isrc,
int
isrc_step,
int
isrc_offset,
__global
const
int
*bounds,
int
bounds_step,
int
bounds_offset,
__global
const
float
*tail,
int
tail_step,
int
tail_offset,
__global
float
*dist,
int
dist_step,
int
dist_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
float
radius,
int
iterNum
)
{
int
i
=
get_global_id
(
0
)
;
int
j
=
get_global_id
(
1
)
;
if
(
!
(
i
>=
0
&&
i
<
rows
)
|
| !(j >= 0 && j < cols))
return;
dst += mad24(i, dst_step, dst_offset);
isrc += mad24(i, isrc_step, isrc_offset);
src += mad24(i, src_step, src_offset);
int ioffset = mad24(iterNum*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
bounds = PtrAddConst(bounds, ioffset, int);
int il = bounds[0];
int ir = bounds[1];
int toffset = mad24(iterNum*rows + i, tail_step, mad24(j, 2*(int)sizeof(float), tail_offset));
tail = PtrAddConst(tail, toffset, float);
float tl = tail[0];
float tr = tail[1];
dist = PtrAdd(dist, mad24(i, dist_step, dist_offset), float);
float dl = tl / dist[il-1];
float dr = tr / dist[ir];
SrcVec res;
res = getPix(isrc, ir) - getPix(isrc, il); //center part
res += 0.5f*tl * ( dl*getPix(src, il - 1) + (2.0f - dl)*getPix(src, il) ); //left tail
res += 0.5f*tr * ( dr*getPix(src, ir + 1) + (2.0f - dr)*getPix(src, ir) ); //right tail
res /= radius;
storePix(res, dst, j);
}
__kernel void filter_NC_hor_by_bounds(__global const uchar *isrc, int isrc_step, int isrc_offset,
__global const uchar *bounds, int bounds_step, int bounds_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols,
int iterNum)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < rows) || !(j >= 0 && j < cols))
return;
dst += mad24(i, dst_step, dst_offset);
isrc += mad24(i, isrc_step, isrc_offset);
int ioffset = mad24(iterNum*rows + i, bounds_step, mad24(j, 2*(int)sizeof(int), bounds_offset));
__global const int *index = PtrAddConst(bounds, ioffset, int);
int li = index[0];
int hi = index[1];
SrcVec res = (getPix(isrc, hi+1) - getPix(isrc, li)) / (float)(hi - li + 1);
storePix(res, dst, j);
}
__kernel void filter_NC_hor(__global uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global const uchar *idist, int idist_step, int idist_offset,
__global uchar *dst, int dst_step, int dst_offset,
float radius)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (!(i >= 0 && i < src_rows) || !(j >= 0 && j < src_cols))
return;
src += mad24(i, src_step, src_offset);
idist += mad24(i, idist_step, idist_offset);
dst += mad24(i, dst_step, dst_offset);
int low_bound = j;
int high_bound = j;
float cur_idt_val = getFloat(idist, j);
float low_idt_val = cur_idt_val - radius;
float high_idt_val = cur_idt_val + radius;
SrcVec sum = getPix(src, j);
while (low_bound > 0 && low_idt_val < getFloat(idist, low_bound-1))
{
low_bound--;
sum += getPix(src, low_bound);
}
while (getFloat(idist, high_bound + 1) < high_idt_val)
{
high_bound++;
sum += getPix(src, high_bound);
}
storePix(sum / (float)(high_bound - low_bound + 1), dst, j);
}
__kernel void filter_NC_vert(__global const uchar *src, int src_step, int src_offset, int src_rows, int src_cols,
__global const uchar *idist, int idist_step, int idist_offset,
__global uchar *dst, int dst_step, int dst_offset,
float radius)
{
int j = get_global_id(0);
if (!(j >= 0 && j < src_cols))
return;
__global const uchar *src_col = src + mad24(j, (int)ELEM_SIZE, src_offset);
__global uchar *dst_col = dst + mad24(j, (int)ELEM_SIZE, dst_offset);
__global const uchar *idist_col = idist + mad24(j, (int)sizeof(float), idist_offset);
int low_bound = 0, high_bound = 0;
SrcVec sum = getPix(src_col, 0);
SrcVec res;
for (int i = 0; i < src_rows; i++)
{
float cur_idt_value = getFloat(idist_col + i*idist_step, 0);
float low_idt_value = cur_idt_value - radius;
float high_idt_value = cur_idt_value + radius;
while (getFloat(idist_col + low_bound*idist_step, 0) < low_idt_value)
{
sum -= getPix(src_col + low_bound*src_step, 0);
low_bound++;
}
while (getFloat(idist_col + (high_bound + 1)*idist_step, 0) < high_idt_value)
{
high_bound++;
sum += getPix(src_col + high_bound*src_step, 0);
}
res = sum / (float)(high_bound - low_bound + 1);
storePix(res, dst_col + i*dst_step, 0);
}
}
__kernel void filter_RF_vert(__global uchar *res, int res_step, int res_offset, int res_rows, int res_cols,
__global uchar *adist, int adist_step, int adist_offset,
int write_new_a_dist)
{
int j = get_global_id(0);
if (!(j >= 0 && j < res_cols))
return;
res += mad24(j, (int)ELEM_SIZE, res_offset);
adist += mad24(j, (int)sizeof(float), adist_offset);
SrcVec cur_val;
float ad;
int i;
cur_val = getPix(res + 0*res_step, 0);
for (i = 1; i < res_rows; i++)
{
ad = getFloat(adist + (i-1)*adist_step, 0);
cur_val = (1.0f - ad)*getPix(res + i*res_step, 0) + ad*cur_val;
storePix(cur_val, res + i*res_step, 0);
}
for (i = res_rows - 2; i >= 0; i--)
{
ad = getFloat(adist + i*adist_step, 0);
cur_val = (1.0f - ad)*getPix(res + i*res_step, 0) + ad*cur_val;
storePix(cur_val, res + i*res_step, 0);
if (write_new_a_dist)
storeFloat(ad*ad, adist + i*adist_step, 0);
}
}
#define genMinMaxPtrs(ptr, h) __global const uchar *ptr##_min = (__global const uchar *)ptr;\
__global const uchar *ptr##_max = (__global const uchar *)ptr + mad24(h, ptr##_step, ptr##_offset);
#define checkPtr(ptr, bound) (bound##_min <= (__global const uchar *)(ptr) && (__global const uchar *)(ptr) < bound##_max)
__kernel void filter_RF_block_init_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *adist, int adist_step, int adist_offset,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int bid = get_global_id(0);
int j = get_global_id(1);
if (j < 0 || j >= cols) return;
int startRow = max(1, bid*blockSize); //skip first row
int endRow = min(rows, (bid + 1)*blockSize);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));;
adist += mad24(j, (int) sizeof(float), mad24(startRow-1, adist_step, adist_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
SrcVec cur_val = (startRow != 1) ? 0 : getPix(res - res_step, 0);
float weight0 = 1.0f;
for (int i = startRow; i < endRow; i++)
{
float ad = getFloat(adist, 0);
cur_val = (1.0f - ad)*getPix(res, 0) + ad*cur_val;
storePix(cur_val, res, 0);
weight0 *= ad;
storeFloat(weight0, weights, 0);
res += res_step;
adist += adist_step;
weights += weights_step;
}
}
__kernel void filter_RF_block_fill_borders_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int j = get_global_id(0);
if (j < 0 || j >= cols) return;
int startRow = 2*blockSize - 1;
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
int res_step_block = blockSize*res_step;
int weights_step_block = blockSize*weights_step;
SrcVec prev_pix = getPix(res - res_step_block, 0);
for (int i = startRow; i < rows; i += blockSize)
{
prev_pix = getPix(res, 0) + getFloat(weights, 0)*prev_pix;
storePix(prev_pix, res, 0);
res += res_step_block;
weights += weights_step_block;
}
}
__kernel void filter_RF_block_fill_fwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int i = get_global_id(0) + blockSize;
int j = get_global_id(1);
if (j < 0 || j >= cols || i < 0 || i >= rows) return;
if (i % blockSize == blockSize - 1) return; //to avoid rewriting bound pixels
int bid = i / blockSize;
int ref_row_id = bid*blockSize - 1;
__global uchar *res_ref_row = res + mad24(ref_row_id, res_step, res_offset);
__global uchar *res_dst_row = res + mad24(i, res_step, res_offset);
__global float *weights_row = PtrAdd(weights, mad24(i, weights_step, weights_offset), float);
storePix(getPix(res_dst_row, j) + weights_row[j]*getPix(res_ref_row, j), res_dst_row, j);
}
__kernel void filter_RF_block_init_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *adist, int adist_step, int adist_offset,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int bid = get_global_id(0);
int j = get_global_id(1);
if (j < 0 || j >= cols) return;
int startRow = rows-1 - max(1, bid*blockSize);
int endRow = max(-1, rows-1 - (bid+1)*blockSize);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));;
adist += mad24(j, (int) sizeof(float), mad24(startRow, adist_step, adist_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
SrcVec cur_val = (startRow != rows-2) ? 0 : getPix(res + res_step, 0);
float weight0 = 1.0f;
for (int i = startRow; i > endRow; i--)
{
float ad = getFloat(adist, 0);
cur_val = (1.0f - ad)*getPix(res, 0) + ad*cur_val;
storePix(cur_val, res, 0);
weight0 *= ad;
storeFloat(weight0, weights, 0);
res -= res_step;
adist -= adist_step;
weights -= weights_step;
}
}
__kernel void filter_RF_block_fill_borders_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int j = get_global_id(0);
if (j < 0 || j >= cols) return;
int startRow = rows-1 - (2*blockSize - 1);
res += mad24(j, ELEM_SIZE, mad24(startRow, res_step, res_offset));
weights += mad24(j, (int) sizeof(float), mad24(startRow, weights_step, weights_offset));
int res_step_block = blockSize*res_step;
int weights_step_block = blockSize*weights_step;
SrcVec prev_pix = getPix(res + res_step_block, 0);
for (int i = startRow; i >= 0; i -= blockSize)
{
prev_pix = getPix(res, 0) + getFloat(weights, 0)*prev_pix;
storePix(prev_pix, res, 0);
res -= blockSize*res_step;
weights -= blockSize*weights_step;
}
}
__kernel void filter_RF_block_fill_bwd(__global uchar *res, int res_step, int res_offset, int rows, int cols,
__global uchar *weights, int weights_step, int weights_offset,
int blockSize)
{
int i = get_global_id(0) + blockSize;
int j = get_global_id(1);
if (j < 0 || j >= cols || i < 0 |
|
i
>=
rows
)
return
;
if
(
i
%
blockSize
==
blockSize
-
1
)
return
; //to avoid rewriting bound pixels
int
bid
=
i
/
blockSize
;
int
ref_row_id
=
rows-1
-
(
bid*blockSize
-
1
)
;
int
dst_row_id
=
rows-1
-
i
;
__global
uchar
*res_ref_row
=
res
+
mad24
(
ref_row_id,
res_step,
res_offset
)
;
__global
uchar
*res_dst_row
=
res
+
mad24
(
dst_row_id,
res_step,
res_offset
)
;
__global
float
*weights_row
=
PtrAdd
(
weights,
mad24
(
dst_row_id,
weights_step,
weights_offset
)
,
float
)
;
storePix
(
getPix
(
res_dst_row,
j
)
+
weights_row[j]*getPix
(
res_ref_row,
j
)
,
res_dst_row,
j
)
;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment