Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
2068c458
Commit
2068c458
authored
Apr 27, 2013
by
Vadim Pisarevsky
Committed by
OpenCV Buildbot
Apr 27, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #844 from bitwangyaoyao:2.4_integral
parents
4f26f0c2
b386ea72
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
488 additions
and
74 deletions
+488
-74
imgproc.cpp
modules/ocl/src/imgproc.cpp
+78
-63
imgproc_integral.cl
modules/ocl/src/opencl/imgproc_integral.cl
+219
-2
imgproc_integral_sum.cl
modules/ocl/src/opencl/imgproc_integral_sum.cl
+191
-9
No files found.
modules/ocl/src/imgproc.cpp
View file @
2068c458
...
...
@@ -1011,10 +1011,8 @@ namespace cv
warpPerspective_gpu
(
src
,
dst
,
coeffs
,
interpolation
);
}
////////////////////////////////////////////////////////////////////////
// integral
void
integral
(
const
oclMat
&
src
,
oclMat
&
sum
,
oclMat
&
sqsum
)
{
CV_Assert
(
src
.
type
()
==
CV_8UC1
);
...
...
@@ -1028,42 +1026,53 @@ namespace cv
int
vcols
=
(
pre_invalid
+
src
.
cols
+
vlen
-
1
)
/
vlen
;
oclMat
t_sum
,
t_sqsum
;
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32SC1
);
t_sqsum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32FC1
);
int
w
=
src
.
cols
+
1
,
h
=
src
.
rows
+
1
;
sum
.
create
(
h
,
w
,
CV_32SC1
);
sqsum
.
create
(
h
,
w
,
CV_32FC1
);
int
sum_offset
=
sum
.
offset
/
vlen
,
sqsum_offset
=
sqsum
.
offset
/
vlen
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pre_invalid
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
size_t
gt
[
3
]
=
{((
vcols
+
1
)
/
2
)
*
256
,
1
,
1
},
lt
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral
,
"integral_cols"
,
gt
,
lt
,
args
,
-
1
,
-
1
);
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sqsum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum_offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sqsum_offset
));
size_t
gt2
[
3
]
=
{
t_sum
.
cols
*
32
,
1
,
1
},
lt2
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral
,
"integral_rows"
,
gt2
,
lt2
,
args
,
-
1
,
-
1
);
//cout << "tested" << endl;
int
depth
;
if
(
src
.
cols
*
src
.
rows
<=
2901
*
2901
)
//2901 is the maximum size for int when all values are 255
{
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32SC1
);
sum
.
create
(
h
,
w
,
CV_32SC1
);
}
else
{
//Use float to prevent overflow
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32FC1
);
sum
.
create
(
h
,
w
,
CV_32FC1
);
}
t_sqsum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32FC1
);
sqsum
.
create
(
h
,
w
,
CV_32FC1
);
depth
=
sum
.
depth
();
int
sum_offset
=
sum
.
offset
/
vlen
;
int
sqsum_offset
=
sqsum
.
offset
/
vlen
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pre_invalid
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
size_t
gt
[
3
]
=
{((
vcols
+
1
)
/
2
)
*
256
,
1
,
1
},
lt
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral
,
"integral_cols"
,
gt
,
lt
,
args
,
-
1
,
depth
);
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sqsum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum_offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sqsum_offset
));
size_t
gt2
[
3
]
=
{
t_sum
.
cols
*
32
,
1
,
1
},
lt2
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral
,
"integral_rows"
,
gt2
,
lt2
,
args
,
-
1
,
depth
);
}
void
integral
(
const
oclMat
&
src
,
oclMat
&
sum
)
{
CV_Assert
(
src
.
type
()
==
CV_8UC1
);
...
...
@@ -1073,34 +1082,40 @@ namespace cv
int
vcols
=
(
pre_invalid
+
src
.
cols
+
vlen
-
1
)
/
vlen
;
oclMat
t_sum
;
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32SC1
);
int
w
=
src
.
cols
+
1
,
h
=
src
.
rows
+
1
;
sum
.
create
(
h
,
w
,
CV_32SC1
);
int
sum_offset
=
sum
.
offset
/
vlen
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pre_invalid
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
size_t
gt
[
3
]
=
{((
vcols
+
1
)
/
2
)
*
256
,
1
,
1
},
lt
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral_sum
,
"integral_sum_cols"
,
gt
,
lt
,
args
,
-
1
,
-
1
);
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum_offset
));
size_t
gt2
[
3
]
=
{
t_sum
.
cols
*
32
,
1
,
1
},
lt2
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral_sum
,
"integral_sum_rows"
,
gt2
,
lt2
,
args
,
-
1
,
-
1
);
//cout << "tested" << endl;
int
depth
;
if
(
src
.
cols
*
src
.
rows
<=
2901
*
2901
)
{
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32SC1
);
sum
.
create
(
h
,
w
,
CV_32SC1
);
}
else
{
t_sum
.
create
(
src
.
cols
,
src
.
rows
,
CV_32FC1
);
sum
.
create
(
h
,
w
,
CV_32FC1
);
}
depth
=
sum
.
depth
();
int
sum_offset
=
sum
.
offset
/
vlen
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pre_invalid
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
size_t
gt
[
3
]
=
{((
vcols
+
1
)
/
2
)
*
256
,
1
,
1
},
lt
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral_sum
,
"integral_sum_cols"
,
gt
,
lt
,
args
,
-
1
,
depth
);
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
t_sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
t_sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
sum_offset
));
size_t
gt2
[
3
]
=
{
t_sum
.
cols
*
32
,
1
,
1
},
lt2
[
3
]
=
{
256
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_integral_sum
,
"integral_sum_rows"
,
gt2
,
lt2
,
args
,
-
1
,
depth
);
}
/////////////////////// corner //////////////////////////////
...
...
modules/ocl/src/opencl/imgproc_integral.cl
View file @
2068c458
...
...
@@ -60,7 +60,7 @@
#
define
GET_CONFLICT_OFFSET
(
lid
)
((
lid
)
>>
LOG_NUM_BANKS
)
kernel
void
integral_cols
(
__global
uchar4
*src,__global
int
*sum
,
__global
float
*sqsum,
kernel
void
integral_cols
_D4
(
__global
uchar4
*src,__global
int
*sum
,
__global
float
*sqsum,
int
src_offset,int
pre_invalid,int
rows,int
cols,int
src_step,int
dst_step
)
{
unsigned
int
lid
=
get_local_id
(
0
)
;
...
...
@@ -159,7 +159,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
}
kernel
void
integral_rows
(
__global
int4
*srcsum,__global
float4
*
srcsqsum,__global
int
*sum
,
kernel void integral_rows
_D4
(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
...
...
@@ -275,3 +275,219 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
barrier(CLK_LOCAL_MEM_FENCE);
}
}
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
float4 sqsum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
__local float* sqsum_p;
src_step = src_step >> 2;
gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
if(lid > 0 && (i+lid) <= rows)
{
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid |
|
gid
*
4
+
k
<
pre_invalid
)
continue
;
sum[loc_s0
+
k
*
dst_step
/
4]
=
sum_p[k]
;
sqsum[loc_s0
+
k
*
dst_step
/
4]
=
sqsum_p[k]
;
}
sum_p
=
(
__local
float*
)(
&
(
lm_sum[1][bf_loc]
))
;
sqsum_p
=
(
__local
float*
)(
&
(
lm_sqsum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
4
+
k
+
4
>=
cols
+
pre_invalid
)
break
;
sum[loc_s1
+
k
*
dst_step
/
4]
=
sum_p[k]
;
sqsum[loc_s1
+
k
*
dst_step
/
4]
=
sqsum_p[k]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
kernel
void
integral_rows_D5
(
__global
float4
*srcsum,__global
float4
*
srcsqsum,__global
float
*sum
,
__global
float
*sqsum,int
rows,int
cols,int
src_step,int
sum_step,
int
sqsum_step,int
sum_offset,int
sqsum_offset
)
{
unsigned
int
lid
=
get_local_id
(
0
)
;
unsigned
int
gid
=
get_group_id
(
0
)
;
float4
src_t[2],
sum_t[2]
;
float4
sqsrc_t[2],sqsum_t[2]
;
__local
float4
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
float4
lm_sqsum[2][LSIZE
+
LOG_LSIZE]
;
__local
float
*sum_p
;
__local
float
*sqsum_p
;
src_step
=
src_step
>>
4
;
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
{
src_t[0]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2]
:
(
float4
)
0
;
sqsrc_t[0]
=
i
+
lid
<
rows
?
srcsqsum[
(
lid+i
)
*
src_step
+
gid
*
2]
:
(
float4
)
0
;
src_t[1]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2
+
1]
:
(
float4
)
0
;
sqsrc_t[1]
=
i
+
lid
<
rows
?
srcsqsum[
(
lid+i
)
*
src_step
+
gid
*
2
+
1]
:
(
float4
)
0
;
sum_t[0]
=
(
i
==
0
?
(
float4
)
0
:
lm_sum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sqsum_t[0]
=
(
i
==
0
?
(
float4
)
0
:
lm_sqsum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[1]
=
(
i
==
0
?
(
float4
)
0
:
lm_sum[1][LSIZE_2
+
LOG_LSIZE]
)
;
sqsum_t[1]
=
(
i
==
0
?
(
float4
)
0
:
lm_sqsum[1][LSIZE_2
+
LOG_LSIZE]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
bf_loc
=
lid
+
GET_CONFLICT_OFFSET
(
lid
)
;
lm_sum[0][bf_loc]
=
src_t[0]
;
lm_sqsum[0][bf_loc]
=
sqsrc_t[0]
;
lm_sum[1][bf_loc]
=
src_t[1]
;
lm_sqsum[1][bf_loc]
=
sqsrc_t[1]
;
int
offset
=
1
;
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
ai
+=
GET_CONFLICT_OFFSET
(
ai
)
;
bi
+=
GET_CONFLICT_OFFSET
(
bi
)
;
if
((
lid
&
127
)
<
d
)
{
lm_sum[lid
>>
7][bi]
+=
lm_sum[lid
>>
7][ai]
;
lm_sqsum[lid
>>
7][bi]
+=
lm_sqsum[lid
>>
7][ai]
;
}
offset
<<=
1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
<
2
)
{
lm_sum[lid][LSIZE_2
+
LOG_LSIZE]
=
0
;
lm_sqsum[lid][LSIZE_2
+
LOG_LSIZE]
=
0
;
}
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
offset
>>=
1
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
ai
+=
GET_CONFLICT_OFFSET
(
ai
)
;
bi
+=
GET_CONFLICT_OFFSET
(
bi
)
;
if
((
lid
&
127
)
<
d
)
{
lm_sum[lid
>>
7][bi]
+=
lm_sum[lid
>>
7][ai]
;
lm_sum[lid
>>
7][ai]
=
lm_sum[lid
>>
7][bi]
-
lm_sum[lid
>>
7][ai]
;
lm_sqsum[lid
>>
7][bi]
+=
lm_sqsum[lid
>>
7][ai]
;
lm_sqsum[lid
>>
7][ai]
=
lm_sqsum[lid
>>
7][bi]
-
lm_sqsum[lid
>>
7][ai]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
gid
==
0
&&
(
i
+
lid
)
<=
rows
)
{
sum[sum_offset
+
i
+
lid]
=
0
;
sqsum[sqsum_offset
+
i
+
lid]
=
0
;
}
if
(
i
+
lid
==
0
)
{
int
loc0
=
gid
*
2
*
sum_step
;
int
loc1
=
gid
*
2
*
sqsum_step
;
for
(
int
k
=
1
; k <= 8; k++)
{
if
(
gid
*
8
+
k
>
cols
)
break
;
sum[sum_offset
+
loc0
+
k
*
sum_step
/
4]
=
0
;
sqsum[sqsum_offset
+
loc1
+
k
*
sqsum_step
/
4]
=
0
;
}
}
int
loc_s0
=
sum_offset
+
gid
*
2
*
sum_step
+
sum_step
/
4
+
i
+
lid,
loc_s1
=
loc_s0
+
sum_step
;
int
loc_sq0
=
sqsum_offset
+
gid
*
2
*
sqsum_step
+
sqsum_step
/
4
+
i
+
lid,
loc_sq1
=
loc_sq0
+
sqsum_step
;
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
lm_sum[0][bf_loc]
+=
sum_t[0]
;
lm_sum[1][bf_loc]
+=
sum_t[1]
;
lm_sqsum[0][bf_loc]
+=
sqsum_t[0]
;
lm_sqsum[1][bf_loc]
+=
sqsum_t[1]
;
sum_p
=
(
__local
float*
)(
&
(
lm_sum[0][bf_loc]
))
;
sqsum_p
=
(
__local
float*
)(
&
(
lm_sqsum[0][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
k
>=
cols
)
break
;
sum[loc_s0
+
k
*
sum_step
/
4]
=
sum_p[k]
;
sqsum[loc_sq0
+
k
*
sqsum_step
/
4]
=
sqsum_p[k]
;
}
sum_p
=
(
__local
float*
)(
&
(
lm_sum[1][bf_loc]
))
;
sqsum_p
=
(
__local
float*
)(
&
(
lm_sqsum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
4
+
k
>=
cols
)
break
;
sum[loc_s1
+
k
*
sum_step
/
4]
=
sum_p[k]
;
sqsum[loc_sq1
+
k
*
sqsum_step
/
4]
=
sqsum_p[k]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
\ No newline at end of file
modules/ocl/src/opencl/imgproc_integral_sum.cl
View file @
2068c458
...
...
@@ -44,8 +44,13 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
define
LSIZE
256
#
define
LSIZE_1
255
#
define
LSIZE_2
254
...
...
@@ -56,8 +61,8 @@
#
define
GET_CONFLICT_OFFSET
(
lid
)
((
lid
)
>>
LOG_NUM_BANKS
)
kernel
void
integral_sum_cols
(
__global
uchar4
*src,__global
int
*sum
,
int
src_offset,int
pre_invalid,int
rows,int
cols,int
src_step,int
dst_step
)
kernel
void
integral_sum_cols
_D4
(
__global
uchar4
*src,__global
int
*sum
,
int
src_offset,int
pre_invalid,int
rows,int
cols,int
src_step,int
dst_step
)
{
unsigned
int
lid
=
get_local_id
(
0
)
;
unsigned
int
gid
=
get_group_id
(
0
)
;
...
...
@@ -114,7 +119,8 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
int
loc_s0
=
gid
*
dst_step
+
i
+
lid
-
1
-
pre_invalid
*
dst_step
/
4
,
loc_s1
=
loc_s0
+
dst_step
;
lm_sum[0][bf_loc]
+=
sum_t[0]
;
lm_sum[1][bf_loc]
+=
sum_t[1]
;
...
...
@@ -136,9 +142,9 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
kernel
void
integral_sum_rows
(
__global
int4
*srcsum,__global
int
*sum
,
int
rows,int
cols,int
src_step,int
sum_step,
int
sum_offset
)
kernel void integral_sum_rows
_D4
(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
...
...
@@ -196,19 +202,20 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset
+
i
+
lid]
=
0
;
sum[sum_offset + i + lid] = 0;
}
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for
(
int
k
=
1
;
k <= 8;
k++)
for(int k = 1;
k <= 8;
k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
if(lid > 0 && (i+lid) <= rows)
{
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
...
...
@@ -228,3 +235,178 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
}
}
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
src_step = src_step >> 2;
gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows)
{
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid |
|
gid
*
4
+
k
<
pre_invalid
)
continue
;
sum[loc_s0
+
k
*
dst_step
/
4]
=
sum_p[k]
;
}
sum_p
=
(
__local
float*
)(
&
(
lm_sum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
4
+
k
+
4
>=
cols
+
pre_invalid
)
break
;
sum[loc_s1
+
k
*
dst_step
/
4]
=
sum_p[k]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
kernel
void
integral_sum_rows_D5
(
__global
float4
*srcsum,__global
float
*sum
,
int
rows,int
cols,int
src_step,int
sum_step,
int
sum_offset
)
{
unsigned
int
lid
=
get_local_id
(
0
)
;
unsigned
int
gid
=
get_group_id
(
0
)
;
float4
src_t[2],
sum_t[2]
;
__local
float4
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
float
*sum_p
;
src_step
=
src_step
>>
4
;
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
{
src_t[0]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2]
:
(
float4
)
0
;
src_t[1]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2
+
1]
:
(
float4
)
0
;
sum_t[0]
=
(
i
==
0
?
(
float4
)
0
:
lm_sum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[1]
=
(
i
==
0
?
(
float4
)
0
:
lm_sum[1][LSIZE_2
+
LOG_LSIZE]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
bf_loc
=
lid
+
GET_CONFLICT_OFFSET
(
lid
)
;
lm_sum[0][bf_loc]
=
src_t[0]
;
lm_sum[1][bf_loc]
=
src_t[1]
;
int
offset
=
1
;
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
ai
+=
GET_CONFLICT_OFFSET
(
ai
)
;
bi
+=
GET_CONFLICT_OFFSET
(
bi
)
;
if
((
lid
&
127
)
<
d
)
{
lm_sum[lid
>>
7][bi]
+=
lm_sum[lid
>>
7][ai]
;
}
offset
<<=
1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
<
2
)
{
lm_sum[lid][LSIZE_2
+
LOG_LSIZE]
=
0
;
}
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
offset
>>=
1
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
ai
+=
GET_CONFLICT_OFFSET
(
ai
)
;
bi
+=
GET_CONFLICT_OFFSET
(
bi
)
;
if
((
lid
&
127
)
<
d
)
{
lm_sum[lid
>>
7][bi]
+=
lm_sum[lid
>>
7][ai]
;
lm_sum[lid
>>
7][ai]
=
lm_sum[lid
>>
7][bi]
-
lm_sum[lid
>>
7][ai]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
gid
==
0
&&
(
i
+
lid
)
<=
rows
)
{
sum[sum_offset
+
i
+
lid]
=
0
;
}
if
(
i
+
lid
==
0
)
{
int
loc0
=
gid
*
2
*
sum_step
;
for
(
int
k
=
1
; k <= 8; k++)
{
if
(
gid
*
8
+
k
>
cols
)
break
;
sum[sum_offset
+
loc0
+
k
*
sum_step
/
4]
=
0
;
}
}
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
int
loc_s0
=
sum_offset
+
gid
*
2
*
sum_step
+
sum_step
/
4
+
i
+
lid,
loc_s1
=
loc_s0
+
sum_step
;
lm_sum[0][bf_loc]
+=
sum_t[0]
;
lm_sum[1][bf_loc]
+=
sum_t[1]
;
sum_p
=
(
__local
float*
)(
&
(
lm_sum[0][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
k
>=
cols
)
break
;
sum[loc_s0
+
k
*
sum_step
/
4]
=
sum_p[k]
;
}
sum_p
=
(
__local
float*
)(
&
(
lm_sum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
4
+
k
>=
cols
)
break
;
sum[loc_s1
+
k
*
sum_step
/
4]
=
sum_p[k]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment