Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
3787388e
Commit
3787388e
authored
Sep 23, 2014
by
vbystricky
Committed by
vbystricky
Oct 17, 2014
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimize OpenCL version of BFMatcher
parent
22ff1e88
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
279 additions
and
705 deletions
+279
-705
matchers.cpp
modules/features2d/src/matchers.cpp
+125
-311
brute_force_match.cl
modules/features2d/src/opencl/brute_force_match.cl
+154
-394
No files found.
modules/features2d/src/matchers.cpp
View file @
3787388e
...
...
@@ -60,113 +60,58 @@ static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
m
.
create
(
rows
,
cols
,
type
);
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
>
static
bool
ocl_matchUnrolledCached
(
InputArray
_query
,
InputArray
_train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
static
bool
ocl_matchSingle
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
,
(
int
)
MAX_DESC_LEN
);
ocl
::
Kernel
k
(
"BruteForceMatch_UnrollMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
if
(
query
.
empty
()
||
train
.
empty
())
return
false
;
size_t
globalSize
[]
=
{(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
BLOCK_SIZE
*
(
MAX_DESC_LEN
>=
BLOCK_SIZE
?
MAX_DESC_LEN
:
BLOCK_SIZE
)
+
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
const
int
query_rows
=
query
.
rows
();
const
int
query_cols
=
query
.
cols
();
template
<
int
BLOCK_SIZE
>
static
bool
ocl_match
(
InputArray
_query
,
InputArray
_train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
);
ocl
::
Kernel
k
(
"BruteForceMatch_Match"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32S
,
trainIdx
);
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32F
,
distance
);
size_t
globalSize
[]
=
{(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
2
*
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
ocl
::
Device
devDef
=
ocl
::
Device
::
getDefault
();
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
UMat
uquery
=
query
.
getUMat
(),
utrain
=
train
.
getUMat
();
int
kercn
=
1
;
if
(
devDef
.
isIntel
()
&&
(
0
==
(
uquery
.
step
%
4
))
&&
(
0
==
(
uquery
.
cols
%
4
))
&&
(
0
==
(
uquery
.
offset
%
4
))
&&
(
0
==
(
utrain
.
step
%
4
))
&&
(
0
==
(
utrain
.
cols
%
4
))
&&
(
0
==
(
utrain
.
offset
%
4
)))
kercn
=
4
;
static
bool
ocl_matchDispatcher
(
InputArray
query
,
InputArray
train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
{
int
query_cols
=
query
.
size
().
width
;
bool
is_cpu
=
ocl
::
Device
::
getDefault
().
type
()
==
ocl
::
Device
::
TYPE_CPU
;
int
block_size
=
16
;
int
max_desc_len
=
0
;
bool
is_cpu
=
devDef
.
type
()
==
ocl
::
Device
::
TYPE_CPU
;
if
(
query_cols
<=
64
)
{
if
(
!
ocl_matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
max_desc_len
=
64
/
kercn
;
else
if
(
query_cols
<=
128
&&
!
is_cpu
)
{
if
(
!
ocl_matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
else
{
if
(
!
ocl_match
<
16
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
return
true
;
}
max_desc_len
=
128
/
kercn
;
static
bool
ocl_matchSingle
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
int
dstType
)
{
if
(
query
.
empty
()
||
train
.
empty
())
int
depth
=
query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d"
,
ocl
::
typeToStr
(
depth
),
ocl
::
typeToStr
(
CV_MAKETYPE
(
depth
,
kercn
)),
kercn
,
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
block_size
,
max_desc_len
);
ocl
::
Kernel
k
(
"BruteForceMatch_Match"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
int
query_rows
=
query
.
size
().
height
;
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32S
,
trainIdx
);
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32F
,
distance
);
return
ocl_matchDispatcher
(
query
,
train
,
trainIdx
,
distance
,
dstType
);
size_t
globalSize
[]
=
{(
query
.
size
().
height
+
block_size
-
1
)
/
block_size
*
block_size
,
block_size
};
size_t
localSize
[]
=
{
block_size
,
block_size
};
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
uquery
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
utrain
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
uquery
.
rows
);
idx
=
k
.
set
(
idx
,
uquery
.
cols
);
idx
=
k
.
set
(
idx
,
utrain
.
rows
);
idx
=
k
.
set
(
idx
,
utrain
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)(
uquery
.
step
/
sizeof
(
float
)));
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
static
bool
ocl_matchConvert
(
const
Mat
&
trainIdx
,
const
Mat
&
distance
,
std
::
vector
<
std
::
vector
<
DMatch
>
>
&
matches
)
...
...
@@ -213,121 +158,60 @@ static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::v
return
ocl_matchConvert
(
trainIdxCPU
,
distanceCPU
,
matches
);
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
>
static
bool
ocl_knn_matchUnrolledCached
(
InputArray
_query
,
InputArray
_train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
static
bool
ocl_knnMatchSingle
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
,
(
int
)
MAX_DESC_LEN
);
ocl
::
Kernel
k
(
"BruteForceMatch_knnUnrollMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
if
(
query
.
empty
()
||
train
.
empty
())
return
false
;
size_t
globalSize
[]
=
{(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
BLOCK_SIZE
*
(
MAX_DESC_LEN
>=
BLOCK_SIZE
?
MAX_DESC_LEN
:
BLOCK_SIZE
)
+
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
const
int
query_rows
=
query
.
rows
();
const
int
query_cols
=
query
.
cols
();
template
<
int
BLOCK_SIZE
>
static
bool
ocl_knn_match
(
InputArray
_query
,
InputArray
_train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
);
ocl
::
Kernel
k
(
"BruteForceMatch_knnMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32SC2
,
trainIdx
);
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32FC2
,
distance
);
size_t
globalSize
[]
=
{(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
2
*
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
trainIdx
.
setTo
(
Scalar
::
all
(
-
1
));
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
ocl
::
Device
devDef
=
ocl
::
Device
::
getDefault
();
static
bool
ocl_match2Dispatcher
(
InputArray
query
,
InputArray
train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
{
bool
is_cpu
=
ocl
::
Device
::
getDefault
().
type
()
==
ocl
::
Device
::
TYPE_CPU
;
if
(
query
.
size
().
width
<=
64
)
{
if
(
!
ocl_knn_matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
else
if
(
query
.
size
().
width
<=
128
&&
!
is_cpu
)
{
if
(
!
ocl_knn_matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
else
{
if
(
!
ocl_knn_match
<
16
>
(
query
,
train
,
trainIdx
,
distance
,
distType
))
return
false
;
}
return
true
;
}
UMat
uquery
=
query
.
getUMat
(),
utrain
=
train
.
getUMat
();
int
kercn
=
1
;
if
(
devDef
.
isIntel
()
&&
(
0
==
(
uquery
.
step
%
4
))
&&
(
0
==
(
uquery
.
cols
%
4
))
&&
(
0
==
(
uquery
.
offset
%
4
))
&&
(
0
==
(
utrain
.
step
%
4
))
&&
(
0
==
(
utrain
.
cols
%
4
))
&&
(
0
==
(
utrain
.
offset
%
4
)))
kercn
=
4
;
static
bool
ocl_kmatchDispatcher
(
InputArray
query
,
InputArray
train
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
int
distType
)
{
return
ocl_match2Dispatcher
(
query
,
train
,
trainIdx
,
distance
,
distType
);
}
int
block_size
=
16
;
int
max_desc_len
=
0
;
bool
is_cpu
=
devDef
.
type
()
==
ocl
::
Device
::
TYPE_CPU
;
if
(
query_cols
<=
64
)
max_desc_len
=
64
/
kercn
;
else
if
(
query_cols
<=
128
&&
!
is_cpu
)
max_desc_len
=
128
/
kercn
;
static
bool
ocl_knnMatchSingle
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
int
dstType
)
{
if
(
query
.
empty
()
||
train
.
empty
())
int
depth
=
query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d"
,
ocl
::
typeToStr
(
depth
),
ocl
::
typeToStr
(
CV_MAKETYPE
(
depth
,
kercn
)),
kercn
,
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
block_size
,
max_desc_len
);
ocl
::
Kernel
k
(
"BruteForceMatch_knnMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
const
int
nQuery
=
query
.
size
().
height
;
ensureSizeIsEnough
(
1
,
nQuery
,
CV_32SC2
,
trainIdx
);
ensureSizeIsEnough
(
1
,
nQuery
,
CV_32FC2
,
distance
);
trainIdx
.
setTo
(
Scalar
::
all
(
-
1
));
return
ocl_kmatchDispatcher
(
query
,
train
,
trainIdx
,
distance
,
dstType
);
size_t
globalSize
[]
=
{(
query_rows
+
block_size
-
1
)
/
block_size
*
block_size
,
block_size
};
size_t
localSize
[]
=
{
block_size
,
block_size
};
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
uquery
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
utrain
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
uquery
.
rows
);
idx
=
k
.
set
(
idx
,
uquery
.
cols
);
idx
=
k
.
set
(
idx
,
utrain
.
rows
);
idx
=
k
.
set
(
idx
,
utrain
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)(
uquery
.
step
/
sizeof
(
float
)));
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
static
bool
ocl_knnMatchConvert
(
const
Mat
&
trainIdx
,
const
Mat
&
distance
,
std
::
vector
<
std
::
vector
<
DMatch
>
>
&
matches
,
bool
compactResult
)
...
...
@@ -383,134 +267,64 @@ static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std
Mat
trainIdxCPU
=
trainIdx
.
getMat
(
ACCESS_READ
);
Mat
distanceCPU
=
distance
.
getMat
(
ACCESS_READ
);
if
(
ocl_knnMatchConvert
(
trainIdxCPU
,
distanceCPU
,
matches
,
compactResult
)
)
return
true
;
return
false
;
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
>
static
bool
ocl_matchUnrolledCached
(
InputArray
_query
,
InputArray
_train
,
float
maxDistance
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
const
UMat
&
nMatches
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
,
(
int
)
MAX_DESC_LEN
);
ocl
::
Kernel
k
(
"BruteForceMatch_RadiusUnrollMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
size_t
globalSize
[]
=
{(
_train
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
2
*
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
maxDistance
);
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
nMatches
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
trainIdx
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
idx
=
k
.
set
(
idx
,
(
int
)
trainIdx
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
//radius_match
template
<
int
BLOCK_SIZE
>
static
bool
ocl_radius_match
(
InputArray
_query
,
InputArray
_train
,
float
maxDistance
,
const
UMat
&
trainIdx
,
const
UMat
&
distance
,
const
UMat
&
nMatches
,
int
distType
)
{
int
depth
=
_query
.
depth
();
cv
::
String
opts
;
opts
=
format
(
"-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d"
,
ocl
::
typeToStr
(
depth
),
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
(
int
)
BLOCK_SIZE
);
ocl
::
Kernel
k
(
"BruteForceMatch_RadiusMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
size_t
globalSize
[]
=
{(
_train
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
(
_query
.
size
().
height
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
const
size_t
smemSize
=
(
2
*
BLOCK_SIZE
*
BLOCK_SIZE
)
*
sizeof
(
int
);
if
(
globalSize
[
0
]
!=
0
)
{
UMat
query
=
_query
.
getUMat
(),
train
=
_train
.
getUMat
();
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
query
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
train
));
idx
=
k
.
set
(
idx
,
maxDistance
);
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
nMatches
));
idx
=
k
.
set
(
idx
,
(
void
*
)
NULL
,
smemSize
);
idx
=
k
.
set
(
idx
,
query
.
rows
);
idx
=
k
.
set
(
idx
,
query
.
cols
);
idx
=
k
.
set
(
idx
,
train
.
rows
);
idx
=
k
.
set
(
idx
,
train
.
cols
);
idx
=
k
.
set
(
idx
,
trainIdx
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)
query
.
step
);
idx
=
k
.
set
(
idx
,
(
int
)
trainIdx
.
step
);
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
return
true
;
}
static
bool
ocl_rmatchDispatcher
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
UMat
&
nMatches
,
float
maxDistance
,
int
distType
)
{
bool
is_cpu
=
ocl
::
Device
::
getDefault
().
type
()
==
ocl
::
Device
::
TYPE_CPU
;
int
query_cols
=
query
.
size
().
width
;
if
(
query_cols
<=
64
)
{
if
(
!
ocl_matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
))
return
false
;
}
else
if
(
query_cols
<=
128
&&
!
is_cpu
)
{
if
(
!
ocl_matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
))
return
false
;
}
else
{
if
(
!
ocl_radius_match
<
16
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
))
return
false
;
}
return
true
;
return
ocl_knnMatchConvert
(
trainIdxCPU
,
distanceCPU
,
matches
,
compactResult
);
}
static
bool
ocl_radiusMatchSingle
(
InputArray
query
,
InputArray
train
,
UMat
&
trainIdx
,
UMat
&
distance
,
UMat
&
nMatches
,
float
maxDistance
,
int
distType
)
{
if
(
query
.
empty
()
||
train
.
empty
())
return
false
;
const
int
nQuery
=
query
.
size
().
height
;
const
int
nTrain
=
train
.
size
().
height
;
const
int
query_rows
=
query
.
rows
()
;
const
int
train_rows
=
train
.
rows
()
;
ensureSizeIsEnough
(
1
,
nQuery
,
CV_32SC1
,
nMatches
);
ensureSizeIsEnough
(
1
,
query_rows
,
CV_32SC1
,
nMatches
);
if
(
trainIdx
.
empty
())
{
ensureSizeIsEnough
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
),
CV_32SC1
,
trainIdx
);
ensureSizeIsEnough
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
),
CV_32FC1
,
distance
);
ensureSizeIsEnough
(
query_rows
,
std
::
max
((
train_rows
/
100
),
10
),
CV_32SC1
,
trainIdx
);
ensureSizeIsEnough
(
query_rows
,
std
::
max
((
train_rows
/
100
),
10
),
CV_32FC1
,
distance
);
}
nMatches
.
setTo
(
Scalar
::
all
(
0
));
return
ocl_rmatchDispatcher
(
query
,
train
,
trainIdx
,
distance
,
nMatches
,
maxDistance
,
distType
);
ocl
::
Device
devDef
=
ocl
::
Device
::
getDefault
();
UMat
uquery
=
query
.
getUMat
(),
utrain
=
train
.
getUMat
();
int
kercn
=
1
;
if
(
devDef
.
isIntel
()
&&
(
0
==
(
uquery
.
step
%
4
))
&&
(
0
==
(
uquery
.
cols
%
4
))
&&
(
0
==
(
uquery
.
offset
%
4
))
&&
(
0
==
(
utrain
.
step
%
4
))
&&
(
0
==
(
utrain
.
cols
%
4
))
&&
(
0
==
(
utrain
.
offset
%
4
)))
kercn
=
4
;
int
block_size
=
16
;
int
depth
=
query
.
depth
();
cv
::
String
opts
;
opts
=
cv
::
format
(
"-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d"
,
ocl
::
typeToStr
(
depth
),
ocl
::
typeToStr
(
CV_MAKETYPE
(
depth
,
kercn
)),
kercn
,
depth
==
CV_32F
?
"-D T_FLOAT"
:
""
,
distType
,
block_size
);
ocl
::
Kernel
k
(
"BruteForceMatch_RadiusMatch"
,
ocl
::
features2d
::
brute_force_match_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
size_t
globalSize
[]
=
{(
train_rows
+
block_size
-
1
)
/
block_size
*
block_size
,
(
query_rows
+
block_size
-
1
)
/
block_size
*
block_size
,
1
};
size_t
localSize
[]
=
{
block_size
,
block_size
,
1
};
int
idx
=
0
;
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
uquery
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrReadOnly
(
utrain
));
idx
=
k
.
set
(
idx
,
maxDistance
);
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
trainIdx
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
distance
));
idx
=
k
.
set
(
idx
,
ocl
::
KernelArg
::
PtrWriteOnly
(
nMatches
));
idx
=
k
.
set
(
idx
,
uquery
.
rows
);
idx
=
k
.
set
(
idx
,
uquery
.
cols
);
idx
=
k
.
set
(
idx
,
utrain
.
rows
);
idx
=
k
.
set
(
idx
,
utrain
.
cols
);
idx
=
k
.
set
(
idx
,
trainIdx
.
cols
);
idx
=
k
.
set
(
idx
,
(
int
)(
uquery
.
step
/
sizeof
(
float
)));
idx
=
k
.
set
(
idx
,
(
int
)(
trainIdx
.
step
/
sizeof
(
int
)));
return
k
.
run
(
2
,
globalSize
,
localSize
,
false
);
}
static
bool
ocl_radiusMatchConvert
(
const
Mat
&
trainIdx
,
const
Mat
&
distance
,
const
Mat
&
_nMatches
,
...
...
modules/features2d/src/opencl/brute_force_match.cl
View file @
3787388e
...
...
@@ -59,39 +59,71 @@
#
define
MAX_DESC_LEN
64
#
endif
#
define
BLOCK_SIZE_ODD
(
BLOCK_SIZE
+
1
)
#
ifndef
SHARED_MEM_SZ
#
if
(
BLOCK_SIZE
<
MAX_DESC_LEN
)
#
define
SHARED_MEM_SZ
(
kercn
*
(
BLOCK_SIZE
*
MAX_DESC_LEN
+
BLOCK_SIZE
*
BLOCK_SIZE
))
#
else
#
define
SHARED_MEM_SZ
(
kercn
*
2
*
BLOCK_SIZE_ODD
*
BLOCK_SIZE
)
#
endif
#
endif
#
ifndef
DIST_TYPE
#
define
DIST_TYPE
2
#
endif
//
dirty
fix
for
non-template
support
#
if
(
DIST_TYPE
==
2
)
//
L1Dist
#
if
(
DIST_TYPE
==
2
)
//
L1Dist
#
ifdef
T_FLOAT
#
define
DIST
(
x,
y
)
fabs
((
x
)
-
(
y
))
typedef
float
value_type
;
typedef
float
result_type
;
#
if
(
8
==
kercn
)
typedef
float8
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
fabs
((
x
)
-
(
y
))
; result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
#
elif
(
4
==
kercn
)
typedef
float4
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
fabs
((
x
)
-
(
y
))
; result += d.s0 + d.s1 + d.s2 + d.s3;}
#
else
typedef
float
value_type
;
#
define
DIST
(
x,
y
)
result
+=
fabs
((
x
)
-
(
y
))
#
endif
#
else
#
define
DIST
(
x,
y
)
abs
((
x
)
-
(
y
))
typedef
int
value_type
;
typedef
int
result_type
;
#
if
(
8
==
kercn
)
typedef
int8
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
abs
((
x
)
-
(
y
))
; result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
#
elif
(
4
==
kercn
)
typedef
int4
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
abs
((
x
)
-
(
y
))
; result += d.s0 + d.s1 + d.s2 + d.s3;}
#
else
typedef
int
value_type
;
#
define
DIST
(
x,
y
)
result
+=
abs
((
x
)
-
(
y
))
#
endif
#
endif
#
define
DIST_RES
(
x
)
(
x
)
#
define
DIST_RES
(
x
)
(
x
)
#
elif
(
DIST_TYPE
==
4
)
//
L2Dist
#
define
DIST
(
x,
y
)
(((
x
)
-
(
y
))
*
((
x
)
-
(
y
)))
typedef
float
value_type
;
typedef
float
result_type
;
#
define
DIST_RES
(
x
)
sqrt
(
x
)
typedef
float
result_type
;
#
if
(
8
==
kercn
)
typedef
float8
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
((
x
)
-
(
y
))
; result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
#
elif
(
4
==
kercn
)
typedef
float4
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
((
x
)
-
(
y
))
; result += dot(d, d);}
#
else
typedef
float
value_type
;
#
define
DIST
(
x,
y
)
{value_type
d
=
((
x
)
-
(
y
))
; result = mad(d, d, result);}
#
endif
#
define
DIST_RES
(
x
)
sqrt
(
x
)
#
elif
(
DIST_TYPE
==
6
)
//
Hamming
//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
inline
int
bit1Count
(
int
v
)
{
v
=
v
-
((
v
>>
1
)
&
0x55555555
)
; // reuse input as temporary
v
=
(
v
&
0x33333333
)
+
((
v
>>
2
)
&
0x33333333
)
; // temp
return
((
v
+
(
v
>>
4
)
&
0xF0F0F0F
)
*
0x1010101
)
>>
24
; // count
}
#
define
DIST
(
x,
y
)
bit1Count
(
(
x
)
^
(
y
)
)
typedef
int
value_type
;
typedef
int
result_type
;
#
define
DIST_RES
(
x
)
(
x
)
#
if
(
8
==
kercn
)
typedef
int8
value_type
;
#
elif
(
4
==
kercn
)
typedef
int4
value_type
;
#
else
typedef
int
value_type
;
#
endif
typedef
int
result_type
;
#
define
DIST
(
x,
y
)
result
+=
popcount
(
(
x
)
^
(
y
)
)
#
define
DIST_RES
(
x
)
(
x
)
#
endif
inline
result_type
reduce_block
(
...
...
@@ -105,9 +137,7 @@ inline result_type reduce_block(
#
pragma
unroll
for
(
int
j
=
0
; j < BLOCK_SIZE ; j++)
{
result
+=
DIST
(
s_query[lidy
*
BLOCK_SIZE
+
j],
s_train[j
*
BLOCK_SIZE
+
lidx]
)
;
DIST
(
s_query[lidy
*
BLOCK_SIZE_ODD
+
j],
s_train[j
*
BLOCK_SIZE_ODD
+
lidx]
)
;
}
return
DIST_RES
(
result
)
;
}
...
...
@@ -123,11 +153,9 @@ inline result_type reduce_block_match(
#
pragma
unroll
for
(
int
j
=
0
; j < BLOCK_SIZE ; j++)
{
result
+=
DIST
(
s_query[lidy
*
BLOCK_SIZE
+
j],
s_train[j
*
BLOCK_SIZE
+
lidx]
)
;
DIST
(
s_query[lidy
*
BLOCK_SIZE_ODD
+
j],
s_train[j
*
BLOCK_SIZE_ODD
+
lidx]
)
;
}
return
(
result
)
;
return
result
;
}
inline
result_type
reduce_multi_block
(
...
...
@@ -142,23 +170,16 @@ inline result_type reduce_multi_block(
#
pragma
unroll
for
(
int
j
=
0
; j < BLOCK_SIZE ; j++)
{
result
+=
DIST
(
s_query[lidy
*
MAX_DESC_LEN
+
block_index
*
BLOCK_SIZE
+
j],
s_train[j
*
BLOCK_SIZE
+
lidx]
)
;
DIST
(
s_query[lidy
*
MAX_DESC_LEN
+
block_index
*
BLOCK_SIZE
+
j],
s_train[j
*
BLOCK_SIZE
+
lidx]
)
;
}
return
result
;
}
/*
2dim
launch,
global
size:
dim0
is
(
query
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE,
dim1
is
BLOCK_SIZE
local
size:
dim0
is
BLOCK_SIZE,
dim1
is
BLOCK_SIZE.
*/
__kernel
void
BruteForceMatch_UnrollMatch
(
__kernel
void
BruteForceMatch_Match
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
...
...
@@ -170,17 +191,26 @@ __kernel void BruteForceMatch_UnrollMatch(
const
int
lidy
=
get_local_id
(
1
)
;
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
queryIdx
=
mad24
(
BLOCK_SIZE,
groupidx,
lidy
)
;
const
int
queryOffset
=
min
(
queryIdx,
query_rows
-
1
)
*
step
;
__global
TN
*query_vec
=
(
__global
TN
*
)(
query
+
queryOffset
)
;
query_cols
/=
kercn
;
__local
float
sharebuffer[SHARED_MEM_SZ]
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
MAX_DESC_LEN
;
int
queryIdx
=
groupidx
*
BLOCK_SIZE
+
lidy
;
#
if
0
<
MAX_DESC_LEN
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
MAX_DESC_LEN
;
//
load
the
query
into
local
memory.
#
pragma
unroll
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE; i
++)
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE; i
++)
{
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
s_query[
lidy
*
MAX_DESC_LEN
+
loadx]
=
loadx
<
query_cols
?
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
const
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
s_query[
mad24
(
MAX_DESC_LEN,
lidy,
loadx
)
]
=
loadx
<
query_cols
?
query_vec[
loadx]
:
0
;
}
#
else
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE_ODD
*
BLOCK_SIZE
;
#
endif
float
myBestDistance
=
MAX_FLOAT
;
int
myBestTrainIdx
=
-1
;
...
...
@@ -189,12 +219,16 @@ __kernel void BruteForceMatch_UnrollMatch(
for
(
int
t
=
0
,
endt
=
(
train_rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
; t < endt; t++)
{
result_type
result
=
0
;
const
int
trainOffset
=
min
(
mad24
(
BLOCK_SIZE,
t
,
lidy
)
,
train_rows
-
1
)
*
step
;
__global
TN
*train_vec
=
(
__global
TN
*
)(
train
+
trainOffset
)
;
#
if
0
<
MAX_DESC_LEN
#
pragma
unroll
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE
; i++)
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE
; i++)
{
//load
a
BLOCK_SIZE
*
BLOCK_SIZE
block
into
local
train.
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy]
=
loadx
<
train_cols
?
train[min
(
t
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
const
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
s_train[
mad24
(
BLOCK_SIZE,
lidx,
lidy
)
]
=
loadx
<
train_cols
?
train_vec[
loadx]
:
0
;
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -203,89 +237,18 @@ __kernel void BruteForceMatch_UnrollMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
result
=
DIST_RES
(
result
)
;
int
trainIdx
=
t
*
BLOCK_SIZE
+
lidx
;
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
result
<
myBestDistance/*
&&
mask
(
queryIdx,
trainIdx
)
*/
)
{
myBestDistance
=
result
;
myBestTrainIdx
=
trainIdx
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
__local
float*
)(
sharebuffer
)
;
__local
int*
s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
)
;
//find
BestMatch
s_distance
+=
lidy
*
BLOCK_SIZE
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE
;
s_distance[lidx]
=
myBestDistance
;
s_trainIdx[lidx]
=
myBestTrainIdx
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//reduce
--
now
all
reduce
implement
in
each
threads.
#
pragma
unroll
for
(
int
k
=
0
; k < BLOCK_SIZE; k++)
{
if
(
myBestDistance
>
s_distance[k]
)
#
else
for
(
int
i
=
0
,
endq
=
(
query_cols
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
; i < endq; i++)
{
myBestDistance
=
s_distance[k]
;
myBestTrainIdx
=
s_trainIdx[k]
;
}
}
if
(
queryIdx
<
query_rows
&&
lidx
==
0
)
{
bestTrainIdx[queryIdx]
=
myBestTrainIdx
;
bestDistance[queryIdx]
=
myBestDistance
;
}
}
__kernel
void
BruteForceMatch_Match
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
int
train_cols,
int
step
)
{
const
int
lidx
=
get_local_id
(
0
)
;
const
int
lidy
=
get_local_id
(
1
)
;
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
queryIdx
=
groupidx
*
BLOCK_SIZE
+
lidy
;
float
myBestDistance
=
MAX_FLOAT
;
int
myBestTrainIdx
=
-1
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
;
//
loop
for
(
int
t
=
0
; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{
result_type
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
{
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
const
int
loadx
=
mad24
(
i,
BLOCK_SIZE,
lidx
)
;
//load
query
and
train
into
local
memory
s_query[
lidy
*
BLOCK_SIZE
+
lidx
]
=
0
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy
]
=
0
;
s_query[
mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
0
;
s_train[
mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
0
;
if
(
loadx
<
query_cols
)
{
s_query[
lidy
*
BLOCK_SIZE
+
lidx]
=
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy]
=
train[min
(
t
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
;
s_query[
mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
query_vec[
loadx]
;
s_train[
mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
train_vec[
loadx]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -294,10 +257,10 @@ __kernel void BruteForceMatch_Match(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
#
endif
result
=
DIST_RES
(
result
)
;
const
int
trainIdx
=
t
*
BLOCK_SIZE
+
lidx
;
const
int
trainIdx
=
mad24
(
BLOCK_SIZE,
t
,
lidx
)
;
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
result
<
myBestDistance
/*&&
mask
(
queryIdx,
trainIdx
)
*/
)
{
...
...
@@ -309,17 +272,18 @@ __kernel void BruteForceMatch_Match(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
__local
float
*
)
sharebuffer
;
__local
int
*s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
)
;
__local
int
*s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
BLOCK_SIZE
_ODD
*
BLOCK_SIZE
)
;
//findBestMatch
s_distance
+=
lidy
*
BLOCK_SIZE
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE
;
s_distance
+=
lidy
*
BLOCK_SIZE
_ODD
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE
_ODD
;
s_distance[lidx]
=
myBestDistance
;
s_trainIdx[lidx]
=
myBestTrainIdx
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//reduce
--
now
all
reduce
implement
in
each
threads.
#
pragma
unroll
for
(
int
k
=
0
; k < BLOCK_SIZE; k++)
{
if
(
myBestDistance
>
s_distance[k]
)
...
...
@@ -336,76 +300,14 @@ __kernel void BruteForceMatch_Match(
}
}
//radius_unrollmatch
__kernel
void
BruteForceMatch_RadiusUnrollMatch
(
__global
T
*query,
__global
T
*train,
float
maxDistance,
//__global
float
*mask,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
__global
int
*nMatches,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
int
train_cols,
int
bestTrainIdx_cols,
int
step,
int
ostep
)
{
const
int
lidx
=
get_local_id
(
0
)
;
const
int
lidy
=
get_local_id
(
1
)
;
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
groupidy
=
get_group_id
(
1
)
;
const
int
queryIdx
=
groupidy
*
BLOCK_SIZE
+
lidy
;
const
int
trainIdx
=
groupidx
*
BLOCK_SIZE
+
lidx
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
;
result_type
result
=
0
;
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
{
//load
a
BLOCK_SIZE
*
BLOCK_SIZE
block
into
local
train.
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
s_query[lidy
*
BLOCK_SIZE
+
lidx]
=
loadx
<
query_cols
?
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
s_train[lidx
*
BLOCK_SIZE
+
lidy]
=
loadx
<
query_cols
?
train[min
(
groupidx
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
result
+=
reduce_block
(
s_query,
s_train,
lidx,
lidy
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
convert_float
(
result
)
<
maxDistance/*
&&
mask
(
queryIdx,
trainIdx
)
*/
)
{
int
ind
=
atom_inc
(
nMatches
+
queryIdx/*,
(
unsigned
int
)
-1*/
)
;
if
(
ind
<
bestTrainIdx_cols
)
{
bestTrainIdx[queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind]
=
trainIdx
;
bestDistance[queryIdx
*
(
ostep
/
sizeof
(
float
))
+
ind]
=
result
;
}
}
}
//radius_match
__kernel
void
BruteForceMatch_RadiusMatch
(
__global
T
*query,
__global
T
*train,
float
maxDistance,
//__global
float
*mask,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
__global
int
*nMatches,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
...
...
@@ -420,20 +322,34 @@ __kernel void BruteForceMatch_RadiusMatch(
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
groupidy
=
get_group_id
(
1
)
;
const
int
queryIdx
=
groupidy
*
BLOCK_SIZE
+
lidy
;
const
int
trainIdx
=
groupidx
*
BLOCK_SIZE
+
lidx
;
const
int
queryIdx
=
mad24
(
BLOCK_SIZE,
groupidy,
lidy
)
;
const
int
queryOffset
=
min
(
queryIdx,
query_rows
-
1
)
*
step
;
__global
TN
*query_vec
=
(
__global
TN
*
)(
query
+
queryOffset
)
;
const
int
trainIdx
=
mad24
(
BLOCK_SIZE,
groupidx,
lidx
)
;
const
int
trainOffset
=
min
(
mad24
(
BLOCK_SIZE,
groupidx,
lidy
)
,
train_rows
-
1
)
*
step
;
__global
TN
*train_vec
=
(
__global
TN
*
)(
train
+
trainOffset
)
;
query_cols
/=
kercn
;
__local
float
sharebuffer[SHARED_MEM_SZ]
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
_ODD
*
BLOCK_SIZE
;
result_type
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
{
//load
a
BLOCK_SIZE
*
BLOCK_SIZE
block
into
local
train.
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
const
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
s_query[lidy
*
BLOCK_SIZE
+
lidx]
=
loadx
<
query_cols
?
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
s_train[lidx
*
BLOCK_SIZE
+
lidy]
=
loadx
<
query_cols
?
train[min
(
groupidx
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
s_query[mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
0
;
s_train[mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
0
;
if
(
loadx
<
query_cols
)
{
s_query[mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
query_vec[loadx]
;
s_train[mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
train_vec[loadx]
;
}
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -442,28 +358,23 @@ __kernel void BruteForceMatch_RadiusMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
convert_float
(
result
)
<
maxDistance/*
&&
mask
(
queryIdx,
trainIdx
)
*/
)
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
convert_float
(
result
)
<
maxDistance
)
{
int
ind
=
atom_inc
(
nMatches
+
queryIdx
)
;
if
(
ind
<
bestTrainIdx_cols
)
{
bestTrainIdx[
queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind
]
=
trainIdx
;
bestDistance[
queryIdx
*
(
ostep
/
sizeof
(
float
))
+
ind
]
=
result
;
bestTrainIdx[
mad24
(
queryIdx,
ostep,
ind
)
]
=
trainIdx
;
bestDistance[
mad24
(
queryIdx,
ostep,
ind
)
]
=
result
;
}
}
}
__kernel
void
BruteForceMatch_knnUnrollMatch
(
__kernel
void
BruteForceMatch_knnMatch
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
int2
*bestTrainIdx,
__global
float2
*bestDistance,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
...
...
@@ -475,31 +386,45 @@ __kernel void BruteForceMatch_knnUnrollMatch(
const
int
lidy
=
get_local_id
(
1
)
;
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
queryIdx
=
groupidx
*
BLOCK_SIZE
+
lidy
;
const
int
queryIdx
=
mad24
(
BLOCK_SIZE,
groupidx,
lidy
)
;
const
int
queryOffset
=
min
(
queryIdx,
query_rows
-
1
)
*
step
;
__global
TN
*query_vec
=
(
__global
TN
*
)(
query
+
queryOffset
)
;
query_cols
/=
kercn
;
__local
float
sharebuffer[SHARED_MEM_SZ]
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
MAX_DESC_LEN
;
#
if
0
<
MAX_DESC_LEN
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
MAX_DESC_LEN
;
//
load
the
query
into
local
memory.
#
pragma
unroll
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
{
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
s_query[
lidy
*
MAX_DESC_LEN
+
loadx]
=
loadx
<
query_cols
?
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
s_query[
mad24
(
MAX_DESC_LEN,
lidy,
loadx
)
]
=
loadx
<
query_cols
?
query_vec[
loadx]
:
0
;
}
#
else
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE_ODD
*
BLOCK_SIZE
;
#
endif
float
myBestDistance1
=
MAX_FLOAT
;
float
myBestDistance2
=
MAX_FLOAT
;
int
myBestTrainIdx1
=
-1
;
int
myBestTrainIdx2
=
-1
;
//loopUnrolledCached
for
(
int
t
=
0
; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
for
(
int
t
=
0
,
endt
=
(
train_rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
; t < endt ; t++)
{
result_type
result
=
0
;
int
trainOffset
=
min
(
mad24
(
BLOCK_SIZE,
t
,
lidy
)
,
train_rows
-
1
)
*
step
;
__global
TN
*train_vec
=
(
__global
TN
*
)(
train
+
trainOffset
)
;
#
if
0
<
MAX_DESC_LEN
#
pragma
unroll
for
(
int
i
=
0
; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
{
//load
a
BLOCK_SIZE
*
BLOCK_SIZE
block
into
local
train.
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy]
=
loadx
<
train_cols
?
train[min
(
t
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
:
0
;
const
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
s_train[
mad24
(
BLOCK_SIZE,
lidx,
lidy
)
]
=
loadx
<
train_cols
?
train_vec[
loadx]
:
0
;
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -508,143 +433,18 @@ __kernel void BruteForceMatch_knnUnrollMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
result
=
DIST_RES
(
result
)
;
const
int
trainIdx
=
t
*
BLOCK_SIZE
+
lidx
;
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
)
{
if
(
result
<
myBestDistance1
)
{
myBestDistance2
=
myBestDistance1
;
myBestTrainIdx2
=
myBestTrainIdx1
;
myBestDistance1
=
result
;
myBestTrainIdx1
=
trainIdx
;
}
else
if
(
result
<
myBestDistance2
)
{
myBestDistance2
=
result
;
myBestTrainIdx2
=
trainIdx
;
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
local
float
*
)
sharebuffer
;
__local
int
*s_trainIdx
=
(
local
int
*
)(
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
)
;
//
find
BestMatch
s_distance
+=
lidy
*
BLOCK_SIZE
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE
;
s_distance[lidx]
=
myBestDistance1
;
s_trainIdx[lidx]
=
myBestTrainIdx1
;
float
bestDistance1
=
MAX_FLOAT
;
float
bestDistance2
=
MAX_FLOAT
;
int
bestTrainIdx1
=
-1
;
int
bestTrainIdx2
=
-1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidx
==
0
)
{
for
(
int
i
=
0
; i < BLOCK_SIZE ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance1
)
{
bestDistance2
=
bestDistance1
;
bestTrainIdx2
=
bestTrainIdx1
;
bestDistance1
=
val
;
bestTrainIdx1
=
s_trainIdx[i]
;
}
else
if
(
val
<
bestDistance2
)
{
bestDistance2
=
val
;
bestTrainIdx2
=
s_trainIdx[i]
;
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
s_distance[lidx]
=
myBestDistance2
;
s_trainIdx[lidx]
=
myBestTrainIdx2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidx
==
0
)
{
for
(
int
i
=
0
; i < BLOCK_SIZE ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance2
)
{
bestDistance2
=
val
;
bestTrainIdx2
=
s_trainIdx[i]
;
}
}
}
myBestDistance1
=
bestDistance1
;
myBestDistance2
=
bestDistance2
;
myBestTrainIdx1
=
bestTrainIdx1
;
myBestTrainIdx2
=
bestTrainIdx2
;
if
(
queryIdx
<
query_rows
&&
lidx
==
0
)
{
bestTrainIdx[queryIdx]
=
(
int2
)(
myBestTrainIdx1,
myBestTrainIdx2
)
;
bestDistance[queryIdx]
=
(
float2
)(
myBestDistance1,
myBestDistance2
)
;
}
}
__kernel
void
BruteForceMatch_knnMatch
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
int2
*bestTrainIdx,
__global
float2
*bestDistance,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
int
train_cols,
int
step
)
{
const
int
lidx
=
get_local_id
(
0
)
;
const
int
lidy
=
get_local_id
(
1
)
;
const
int
groupidx
=
get_group_id
(
0
)
;
const
int
queryIdx
=
groupidx
*
BLOCK_SIZE
+
lidy
;
__local
value_type
*s_query
=
(
__local
value_type
*
)
sharebuffer
;
__local
value_type
*s_train
=
(
__local
value_type
*
)
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
;
float
myBestDistance1
=
MAX_FLOAT
;
float
myBestDistance2
=
MAX_FLOAT
;
int
myBestTrainIdx1
=
-1
;
int
myBestTrainIdx2
=
-1
;
//loop
for
(
int
t
=
0
; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
{
result_type
result
=
0.0f
;
for
(
int
i
=
0
; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
#
else
for
(
int
i
=
0
,
endq
=
(
query_cols
+
BLOCK_SIZE
-1
)
/
BLOCK_SIZE
; i < endq ; i++)
{
const
int
loadx
=
lidx
+
i
*
BLOCK_SIZE
;
const
int
loadx
=
mad24
(
BLOCK_SIZE,
i,
lidx
)
;
//load
query
and
train
into
local
memory
s_query[
lidy
*
BLOCK_SIZE
+
lidx
]
=
0
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy
]
=
0
;
s_query[
mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
0
;
s_train[
mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
0
;
if
(
loadx
<
query_cols
)
{
s_query[
lidy
*
BLOCK_SIZE
+
lidx]
=
query[min
(
queryIdx,
query_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
;
s_train[
lidx
*
BLOCK_SIZE
+
lidy]
=
train[min
(
t
*
BLOCK_SIZE
+
lidy,
train_rows
-
1
)
*
(
step
/
sizeof
(
float
))
+
loadx]
;
s_query[
mad24
(
BLOCK_SIZE_ODD,
lidy,
lidx
)
]
=
query_vec[
loadx]
;
s_train[
mad24
(
BLOCK_SIZE_ODD,
lidx,
lidy
)
]
=
train_vec[
loadx]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -653,12 +453,12 @@ __kernel void BruteForceMatch_knnMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
#
endif
result
=
DIST_RES
(
result
)
;
const
int
trainIdx
=
t
*
BLOCK_SIZE
+
lidx
;
const
int
trainIdx
=
mad24
(
BLOCK_SIZE,
t
,
lidx
)
;
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
/*&&
mask
(
queryIdx,
trainIdx
)
*/
)
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
)
{
if
(
result
<
myBestDistance1
)
{
...
...
@@ -678,12 +478,11 @@ __kernel void BruteForceMatch_knnMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
__local
float
*
)
sharebuffer
;
__local
int
*s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
BLOCK_SIZE
*
BLOCK_SIZE
)
;
//findBestMatch
s_distance
+=
lidy
*
BLOCK_SIZE
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE
;
__local
int
*s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
BLOCK_SIZE_ODD
*
BLOCK_SIZE
)
;
//
find
BestMatch
s_distance
+=
lidy
*
BLOCK_SIZE_ODD
;
s_trainIdx
+=
lidy
*
BLOCK_SIZE_ODD
;
s_distance[lidx]
=
myBestDistance1
;
s_trainIdx[lidx]
=
myBestTrainIdx1
;
...
...
@@ -746,44 +545,4 @@ __kernel void BruteForceMatch_knnMatch(
bestTrainIdx[queryIdx]
=
(
int2
)(
myBestTrainIdx1,
myBestTrainIdx2
)
;
bestDistance[queryIdx]
=
(
float2
)(
myBestDistance1,
myBestDistance2
)
;
}
}
kernel
void
BruteForceMatch_calcDistanceUnrolled
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
float
*allDist,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
int
train_cols,
int
step
)
{
/*
Todo
*/
}
kernel
void
BruteForceMatch_calcDistance
(
__global
T
*query,
__global
T
*train,
//__global
float
*mask,
__global
float
*allDist,
__local
float
*sharebuffer,
int
query_rows,
int
query_cols,
int
train_rows,
int
train_cols,
int
step
)
{
/*
Todo
*/
}
kernel
void
BruteForceMatch_findBestMatch
(
__global
float
*allDist,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
int
k
)
{
/*
Todo
*/
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment