submodule / opencv · Commits

Commit 2338a895 authored Apr 12, 2013 by peng xiao
Capitalize macro namings.
parent 1bea9ee2
Showing 2 changed files with 100 additions and 100 deletions
modules/ocl/src/brute_force_matcher.cpp      +8  -8
modules/ocl/src/opencl/brute_force_match.cl  +92 -92
modules/ocl/src/brute_force_matcher.cpp  (view file @ 2338a895)
@@ -78,7 +78,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size);
     if(globalSize[0] != 0)
     {

@@ -119,7 +119,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size);
     if(globalSize[0] != 0)
     {

@@ -162,7 +162,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size);
     if(globalSize[0] != 0)
     {

@@ -202,7 +202,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size);
     if(globalSize[0] != 0)
     {

@@ -300,7 +300,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d -D max_desc_len=%d", distType, block_size, m_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", distType, block_size, m_size);
     if(globalSize[0] != 0)
     {

@@ -334,7 +334,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d -D block_size=%d", distType, block_size);
+    sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d", distType, block_size);
     if(globalSize[0] != 0)
     {

@@ -368,7 +368,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d", distType);
+    sprintf(opt, "-D DIST_TYPE=%d", distType);
     if(globalSize[0] != 0)
     {
         args.push_back(make_pair(sizeof(cl_mem), (void *)&query.data));

@@ -401,7 +401,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask
     vector< pair<size_t, const void *> > args;
     char opt[OPT_SIZE] = "";
-    sprintf(opt, "-D distType=%d", distType);
+    sprintf(opt, "-D DIST_TYPE=%d", distType);
     if(globalSize[0] != 0)
     {
         args.push_back(make_pair(sizeof(cl_mem), (void *)&query.data));
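All of the host-side hunks above touch only the macro names embedded in the OpenCL build-options string; the surrounding matcher logic is unchanged. A minimal, self-contained C++ sketch of that build-options pattern follows; the buffer size and the sample values for distType, block_size and m_size are assumptions for illustration, not values taken from the module.

    #include <cstdio>

    int main()
    {
        // Hypothetical values standing in for distType, block_size and m_size above.
        const int distType   = 0;    // e.g. absolute-difference distance
        const int block_size = 16;
        const int m_size     = 64;

        // 256 is a stand-in for the module's OPT_SIZE constant.
        char opt[256] = "";
        std::sprintf(opt, "-D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
                     distType, block_size, m_size);

        // The resulting string is passed as build options when brute_force_match.cl
        // is compiled, overriding the kernel's #ifndef defaults.
        std::printf("%s\n", opt);  // prints: -D DIST_TYPE=0 -D BLOCK_SIZE=16 -D MAX_DESC_LEN=64
        return 0;
    }

Passing the macros as -D definitions lets the same .cl source be specialized at kernel compile time without editing the kernel file, which is why the host and kernel sides must agree on the (now uppercase) names.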
modules/ocl/src/opencl/brute_force_match.cl  (view file @ 2338a895)
@@ -47,11 +47,11 @@
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
 #define MAX_FLOAT 3.40282e+038f

-#ifndef block_size
-#define block_size 16
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 16
 #endif

-#ifndef max_desc_len
-#define max_desc_len 64
+#ifndef MAX_DESC_LEN
+#define MAX_DESC_LEN 64
 #endif

 int bit1Count(float x)

@@ -66,15 +66,15 @@ int bit1Count(float x)
     return (float)c;
 }

-#ifndef distType
-#define distType 0
+#ifndef DIST_TYPE
+#define DIST_TYPE 0
 #endif

-#if (distType == 0)
+#if (DIST_TYPE == 0)
 #define DIST(x, y) fabs((x) - (y))
-#elif (distType == 1)
+#elif (DIST_TYPE == 1)
 #define DIST(x, y) (((x) - (y)) * ((x) - (y)))
-#elif (distType == 2)
+#elif (DIST_TYPE == 2)
 #define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y))
 #endif
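The block above also shows what each DIST_TYPE selects: 0 gives an L1 (absolute-difference) term, 1 a squared-L2 term, and 2 a Hamming-style term via bit1Count of the XOR. A small stand-alone C++ sketch restating those three expressions; the dist helper and its sample values are illustrative only, not part of the module.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Restatement of the kernel's DIST(x, y) macro for each DIST_TYPE.
    // The kernel's bit1Count counts set bits; a plain loop is used here instead.
    static float dist(float x, float y, int distType)
    {
        switch (distType)
        {
        case 0:   // DIST_TYPE == 0: L1, fabs((x) - (y))
            return std::fabs(x - y);
        case 1:   // DIST_TYPE == 1: squared L2, ((x) - (y)) * ((x) - (y))
            return (x - y) * (x - y);
        default:  // DIST_TYPE == 2: Hamming-style, bit count of (uint)(x) ^ (uint)(y)
        {
            uint32_t v = (uint32_t)x ^ (uint32_t)y;
            int c = 0;
            for (; v; v >>= 1)
                c += (int)(v & 1u);
            return (float)c;
        }
        }
    }

    int main()
    {
        std::printf("L1:      %f\n", dist(3.f, 5.f, 0));  // 2.0
        std::printf("L2^2:    %f\n", dist(3.f, 5.f, 1));  // 4.0
        std::printf("Hamming: %f\n", dist(3.f, 5.f, 2));  // popcount(3 ^ 5) = 2
        return 0;
    }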
@@ -87,9 +87,9 @@ float reduce_block(__local float *s_query,
 {
     float result = 0;
 #pragma unroll
-    for (int j = 0 ; j < block_size ; j++)
+    for (int j = 0 ; j < BLOCK_SIZE ; j++)
     {
-        result += DIST(s_query[lidy * block_size + j], s_train[j * block_size + lidx]);
+        result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
     }
     return result;
 }

@@ -103,15 +103,15 @@ float reduce_multi_block(__local float *s_query,
 {
     float result = 0;
 #pragma unroll
-    for (int j = 0 ; j < block_size ; j++)
+    for (int j = 0 ; j < BLOCK_SIZE ; j++)
     {
-        result += DIST(s_query[lidy * max_desc_len + block_index * block_size + j], s_train[j * block_size + lidx]);
+        result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
     }
     return result;
 }

-/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
-local size: dim0 is block_size, dim1 is block_size.
+/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
+local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
 */
 __kernel void BruteForceMatch_UnrollMatch_D5(
     __global float *query,
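The launch comment in the hunk above fixes the work sizes: the first global dimension is the query row count rounded up to a multiple of BLOCK_SIZE, and both local dimensions equal BLOCK_SIZE. A short C++ sketch of that arithmetic, with an assumed query count:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t BLOCK_SIZE = 16;   // kernel default; overridable via -D BLOCK_SIZE=...
        const std::size_t query_rows = 1000; // illustrative value

        // dim0: query rows rounded up to a multiple of BLOCK_SIZE; dim1: BLOCK_SIZE.
        std::size_t globalSize[2] = { (query_rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE };
        std::size_t localSize[2]  = { BLOCK_SIZE, BLOCK_SIZE };

        // For 1000 query rows this prints: global = 1008 x 16, local = 16 x 16.
        std::printf("global = %zu x %zu, local = %zu x %zu\n",
                    globalSize[0], globalSize[1], localSize[0], localSize[1]);
        return 0;
    }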
@@ -133,15 +133,15 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     const int groupidx = get_group_id(0);

     __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * max_desc_len;
+    __local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;

-    int queryIdx = groupidx * block_size + lidy;
+    int queryIdx = groupidx * BLOCK_SIZE + lidy;
     // load the query into local memory.
 #pragma unroll
-    for (int i = 0 ; i < max_desc_len / block_size; i ++)
+    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
     {
-        int loadx = lidx + i * block_size;
-        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        int loadx = lidx + i * BLOCK_SIZE;
+        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
     }

     float myBestDistance = MAX_FLOAT;

@@ -149,15 +149,15 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     // loopUnrolledCached to find the best trainIdx and best distance.
     volatile int imgIdx = 0;
-    for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++)
+    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
     {
         float result = 0;
 #pragma unroll
-        for (int i = 0 ; i < max_desc_len / block_size ; i++)
+        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
         {
-            //load a block_size * block_size block into local train.
-            const int loadx = lidx + i * block_size;
-            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+            const int loadx = lidx + i * BLOCK_SIZE;
+            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);

@@ -167,7 +167,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
             barrier(CLK_LOCAL_MEM_FENCE);
         }

-        int trainIdx = t * block_size + lidx;
+        int trainIdx = t * BLOCK_SIZE + lidx;

         if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
         {

@@ -179,11 +179,11 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     barrier(CLK_LOCAL_MEM_FENCE);

     __local float *s_distance = (__local float*)(sharebuffer);
-    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);

     //find BestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
     s_distance[lidx] = myBestDistance;
     s_trainIdx[lidx] = myBestTrainIdx;

@@ -191,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
     //reduce -- now all reduce implement in each threads.
 #pragma unroll
-    for (int k = 0 ; k < block_size; k++)
+    for (int k = 0 ; k < BLOCK_SIZE; k++)
     {
         if (myBestDistance > s_distance[k])
         {

@@ -225,30 +225,30 @@ __kernel void BruteForceMatch_Match_D5(
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);

-    const int queryIdx = groupidx * block_size + lidy;
+    const int queryIdx = groupidx * BLOCK_SIZE + lidy;

     float myBestDistance = MAX_FLOAT;
     int myBestTrainIdx = -1;

     __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
+    __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;

     // loop
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
     {
         //Dist dist;
         float result = 0;
-        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
         {
-            const int loadx = lidx + i * block_size;
+            const int loadx = lidx + i * BLOCK_SIZE;
             //load query and train into local memory
-            s_query[lidy * block_size + lidx] = 0;
-            s_train[lidx * block_size + lidy] = 0;
+            s_query[lidy * BLOCK_SIZE + lidx] = 0;
+            s_train[lidx * BLOCK_SIZE + lidy] = 0;

             if (loadx < query_cols)
             {
-                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
             }

             barrier(CLK_LOCAL_MEM_FENCE);

@@ -258,7 +258,7 @@ __kernel void BruteForceMatch_Match_D5(
             barrier(CLK_LOCAL_MEM_FENCE);
         }

-        const int trainIdx = t * block_size + lidx;
+        const int trainIdx = t * BLOCK_SIZE + lidx;

         if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
         {

@@ -271,18 +271,18 @@ __kernel void BruteForceMatch_Match_D5(
     barrier(CLK_LOCAL_MEM_FENCE);

     __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);

     //findBestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
     s_distance[lidx] = myBestDistance;
     s_trainIdx[lidx] = myBestTrainIdx;

     barrier(CLK_LOCAL_MEM_FENCE);

     //reduce -- now all reduce implement in each threads.
-    for (int k = 0 ; k < block_size; k++)
+    for (int k = 0 ; k < BLOCK_SIZE; k++)
     {
         if (myBestDistance > s_distance[k])
         {
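The findBestMatch step above has each work-item of a row scan the BLOCK_SIZE candidate distances staged in local memory and keep the minimum together with its train index. A host-side C++ sketch of that reduction pattern, with hypothetical values standing in for the kernel's local buffers:

    #include <cstddef>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Sketch of the kernel's per-row reduction: scan the BLOCK_SIZE candidate
    // distances for one query row and keep the smallest one and its index.
    static void findBestMatch(const std::vector<float> &s_distance,
                              const std::vector<int> &s_trainIdx,
                              float &myBestDistance, int &myBestTrainIdx)
    {
        myBestDistance = std::numeric_limits<float>::max();  // stands in for MAX_FLOAT
        myBestTrainIdx = -1;
        for (std::size_t k = 0; k < s_distance.size(); k++)  // k < BLOCK_SIZE in the kernel
        {
            if (myBestDistance > s_distance[k])
            {
                myBestDistance = s_distance[k];
                myBestTrainIdx = s_trainIdx[k];
            }
        }
    }

    int main()
    {
        // Hypothetical distances for one query row (BLOCK_SIZE would be 16 in the kernel).
        std::vector<float> s_distance = { 7.f, 2.f, 9.f, 4.f };
        std::vector<int>   s_trainIdx = { 10, 11, 12, 13 };

        float bestDist;
        int bestIdx;
        findBestMatch(s_distance, s_trainIdx, bestDist, bestIdx);
        std::printf("best distance %f at trainIdx %d\n", bestDist, bestIdx);  // 2.0 at 11
        return 0;
    }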
@@ -322,20 +322,20 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
     const int groupidx = get_group_id(0);
     const int groupidy = get_group_id(1);

-    const int queryIdx = groupidy * block_size + lidy;
-    const int trainIdx = groupidx * block_size + lidx;
+    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
+    const int trainIdx = groupidx * BLOCK_SIZE + lidx;

     __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
+    __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;

     float result = 0;
-    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
+    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
     {
-        //load a block_size * block_size block into local train.
-        const int loadx = lidx + i * block_size;
+        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+        const int loadx = lidx + i * BLOCK_SIZE;

-        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

         //synchronize to make sure each elem for reduceIteration in share memory is written already.
         barrier(CLK_LOCAL_MEM_FENCE);

@@ -382,20 +382,20 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
     const int groupidx = get_group_id(0);
     const int groupidy = get_group_id(1);

-    const int queryIdx = groupidy * block_size + lidy;
-    const int trainIdx = groupidx * block_size + lidx;
+    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
+    const int trainIdx = groupidx * BLOCK_SIZE + lidx;

     __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
+    __local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;

     float result = 0;
-    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
+    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
     {
-        //load a block_size * block_size block into local train.
-        const int loadx = lidx + i * block_size;
+        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+        const int loadx = lidx + i * BLOCK_SIZE;

-        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

         //synchronize to make sure each elem for reduceIteration in share memory is written already.
         barrier(CLK_LOCAL_MEM_FENCE);

@@ -437,15 +437,15 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);

-    const int queryIdx = groupidx * block_size + lidy;
+    const int queryIdx = groupidx * BLOCK_SIZE + lidy;

     local float *s_query = sharebuffer;
-    local float *s_train = sharebuffer + block_size * max_desc_len;
+    local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;

     // load the query into local memory.
-    for (int i = 0 ; i < max_desc_len / block_size; i ++)
+    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
     {
-        int loadx = lidx + i * block_size;
-        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        int loadx = lidx + i * BLOCK_SIZE;
+        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
     }

     float myBestDistance1 = MAX_FLOAT;

@@ -455,15 +455,15 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     //loopUnrolledCached
     volatile int imgIdx = 0;
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
     {
         float result = 0;
-        for (int i = 0 ; i < max_desc_len / block_size ; i++)
+        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
         {
-            const int loadX = lidx + i * block_size;
-            //load a block_size * block_size block into local train.
-            const int loadx = lidx + i * block_size;
-            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            const int loadX = lidx + i * BLOCK_SIZE;
+            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
+            const int loadx = lidx + i * BLOCK_SIZE;
+            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);

@@ -473,7 +473,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
             barrier(CLK_LOCAL_MEM_FENCE);
         }

-        const int trainIdx = t * block_size + lidx;
+        const int trainIdx = t * BLOCK_SIZE + lidx;

         if (queryIdx < query_rows && trainIdx < train_rows)
         {

@@ -495,11 +495,11 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     barrier(CLK_LOCAL_MEM_FENCE);

     local float *s_distance = (local float *)sharebuffer;
-    local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
+    local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);

     // find BestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
     s_distance[lidx] = myBestDistance1;
     s_trainIdx[lidx] = myBestTrainIdx1;

@@ -512,7 +512,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     if (lidx == 0)
     {
-        for (int i = 0 ; i < block_size ; i++)
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
         {
             float val = s_distance[i];
             if (val < bestDistance1)

@@ -540,7 +540,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
     if (lidx == 0)
     {
-        for (int i = 0 ; i < block_size ; i++)
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
         {
             float val = s_distance[i];

@@ -583,9 +583,9 @@ __kernel void BruteForceMatch_knnMatch_D5(
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);

-    const int queryIdx = groupidx * block_size + lidy;
+    const int queryIdx = groupidx * BLOCK_SIZE + lidy;

     local float *s_query = sharebuffer;
-    local float *s_train = sharebuffer + block_size * block_size;
+    local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;

     float myBestDistance1 = MAX_FLOAT;
     float myBestDistance2 = MAX_FLOAT;

@@ -593,20 +593,20 @@ __kernel void BruteForceMatch_knnMatch_D5(
     int myBestTrainIdx2 = -1;

     //loop
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
     {
         float result = 0.0f;
-        for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
+        for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
        {
-            const int loadx = lidx + i * block_size;
+            const int loadx = lidx + i * BLOCK_SIZE;
             //load query and train into local memory
-            s_query[lidy * block_size + lidx] = 0;
-            s_train[lidx * block_size + lidy] = 0;
+            s_query[lidy * BLOCK_SIZE + lidx] = 0;
+            s_train[lidx * BLOCK_SIZE + lidy] = 0;

             if (loadx < query_cols)
             {
-                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
             }

             barrier(CLK_LOCAL_MEM_FENCE);

@@ -616,7 +616,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
             barrier(CLK_LOCAL_MEM_FENCE);
         }

-        const int trainIdx = t * block_size + lidx;
+        const int trainIdx = t * BLOCK_SIZE + lidx;

         if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
         {

@@ -638,11 +638,11 @@ __kernel void BruteForceMatch_knnMatch_D5(
     barrier(CLK_LOCAL_MEM_FENCE);

     __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);

     //findBestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
+    s_distance += lidy * BLOCK_SIZE;
+    s_trainIdx += lidy * BLOCK_SIZE;
     s_distance[lidx] = myBestDistance1;
     s_trainIdx[lidx] = myBestTrainIdx1;

@@ -655,7 +655,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
     if (lidx == 0)
     {
-        for (int i = 0 ; i < block_size ; i++)
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
         {
             float val = s_distance[i];
             if (val < bestDistance1)

@@ -683,7 +683,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
     if (lidx == 0)
     {
-        for (int i = 0 ; i < block_size ; i++)
+        for (int i = 0 ; i < BLOCK_SIZE ; i++)
         {
             float val = s_distance[i];