Commit ba5e8bef authored by Li Peng's avatar Li Peng

fp16 ocl support for more layers

Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent 3dd91688
...@@ -120,12 +120,16 @@ public: ...@@ -120,12 +120,16 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
CV_Assert(blobs.size() >= 2); CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1); CV_Assert(inputs.size() == 1);
if (use_half && inputs[0].dims == 2)
return false;
if (umat_weight.empty()) if (umat_weight.empty())
{ {
umat_weight = weights_.getUMat(ACCESS_READ); umat_weight = weights_.getUMat(ACCESS_READ);
...@@ -139,6 +143,7 @@ public: ...@@ -139,6 +143,7 @@ public:
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++) for (size_t ii = 0; ii < outputs.size(); ii++)
{ {
if (inpBlob.dims == 2) if (inpBlob.dims == 2)
...@@ -154,8 +159,12 @@ public: ...@@ -154,8 +159,12 @@ public:
UMat src = inputs[ii].reshape(1, s.size(), &s[0]); UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number); String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
...@@ -181,7 +190,7 @@ public: ...@@ -181,7 +190,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -95,7 +95,7 @@ public: ...@@ -95,7 +95,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -307,8 +307,24 @@ public: ...@@ -307,8 +307,24 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
if (use_half)
{
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;
inps.getUMatVector(orig_inputs);
outs.getUMatVector(orig_outputs);
inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
}
else
{
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
}
std::vector<LabelBBox> allDecodedBBoxes; std::vector<LabelBBox> allDecodedBBoxes;
std::vector<Mat> allConfidenceScores; std::vector<Mat> allConfidenceScores;
...@@ -342,6 +358,12 @@ public: ...@@ -342,6 +358,12 @@ public:
{ {
// Set confidences to zeros. // Set confidences to zeros.
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
if (use_half)
{
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs[0](ranges).setTo(0);
} else
outputs[0](ranges).setTo(0); outputs[0](ranges).setTo(0);
return true; return true;
} }
...@@ -360,9 +382,23 @@ public: ...@@ -360,9 +382,23 @@ public:
} }
CV_Assert(count == numKept); CV_Assert(count == numKept);
} }
if (use_half)
{
UMat half_umat;
convertFp16(umat, half_umat);
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs.clear();
orig_outputs.push_back(half_umat);
outs.assign(orig_outputs);
} else {
outputs.clear(); outputs.clear();
outputs.push_back(umat); outputs.push_back(umat);
outs.assign(outputs); outs.assign(outputs);
}
return true; return true;
} }
#endif #endif
...@@ -372,7 +408,7 @@ public: ...@@ -372,7 +408,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -87,6 +87,9 @@ public: ...@@ -87,6 +87,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
...@@ -162,7 +165,7 @@ public: ...@@ -162,7 +165,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -288,9 +288,11 @@ public: ...@@ -288,9 +288,11 @@ public:
if (!_needsPermute) if (!_needsPermute)
return false; return false;
bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);
kernel.set(0, (int)_count); kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
...@@ -313,7 +315,7 @@ public: ...@@ -313,7 +315,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -316,6 +316,7 @@ public: ...@@ -316,6 +316,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -340,9 +341,15 @@ public: ...@@ -340,9 +341,15 @@ public:
heights.copyTo(umat_heights); heights.copyTo(umat_heights);
} }
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";
size_t nthreads = _layerHeight * _layerWidth; size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX); kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY); kernel.set(2, (float)_stepY);
...@@ -375,7 +382,7 @@ public: ...@@ -375,7 +382,7 @@ public:
// set the variance. // set the variance.
{ {
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc); ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2); int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors; size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
...@@ -395,7 +402,7 @@ public: ...@@ -395,7 +402,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -158,6 +158,9 @@ public: ...@@ -158,6 +158,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
...@@ -237,7 +240,7 @@ public: ...@@ -237,7 +240,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -127,7 +127,7 @@ public: ...@@ -127,7 +127,7 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
// TODO: implement a logistic activation to classification scores. // TODO: implement a logistic activation to classification scores.
if (useLogistic) if (useLogistic || inps.depth() == CV_16S)
return false; return false;
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
...@@ -191,7 +191,7 @@ public: ...@@ -191,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -96,9 +96,10 @@ public: ...@@ -96,9 +96,10 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
...@@ -134,7 +135,7 @@ public: ...@@ -134,7 +135,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -219,7 +219,7 @@ public: ...@@ -219,7 +219,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -40,24 +40,27 @@ ...@@ -40,24 +40,27 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
#if NUM == 8 #if NUM == 8
#define load(src, index) vload8(0, src + index) #define load(src, index) vload8(0, src + index)
#define store(vec, dst, index) vstore8(vec, 0, dst + index) #define store(vec, dst, index) vstore8(vec, 0, dst + index)
#define vec_type Dtype8 #define float_type float8
#define convert_f convert_float8
#define BATCH_NORM batch_norm8 #define BATCH_NORM batch_norm8
#elif NUM == 4 #elif NUM == 4
#define load(src, index) vload4(0, src + index) #define load(src, index) vload4(0, src + index)
#define store(vec, dst, index) vstore4(vec, 0, dst + index) #define store(vec, dst, index) vstore4(vec, 0, dst + index)
#define vec_type Dtype4 #define float_type float4
#define convert_f convert_float4
#define BATCH_NORM batch_norm4 #define BATCH_NORM batch_norm4
#elif NUM == 1 #elif NUM == 1
#define load(src, index) src[index] #define load(src, index) src[index]
#define store(vec, dst, index) dst[index] = vec #define store(vec, dst, index) dst[index] = vec
#define vec_type Dtype #define float_type float
#define convert_f convert_float
#define BATCH_NORM batch_norm1 #define BATCH_NORM batch_norm1
#endif #endif
...@@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src, ...@@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
const int rows, const int rows,
const int cols, const int cols,
const int channels, const int channels,
__global const Dtype* weight, __global const float* weight,
__global const Dtype* bias, __global const float* bias,
__global Dtype* dst) __global Dtype* dst)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src, ...@@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
if (x >= rows || y >= cols) if (x >= rows || y >= cols)
return; return;
Dtype w = weight[x % channels]; float w = weight[x % channels];
Dtype b = bias[x % channels]; float b = bias[x % channels];
vec_type src_vec = load(src, index); float_type src_vec = convert_f(load(src, index));
vec_type dst_vec = src_vec * w + (vec_type)b; float_type dst_vec = src_vec * w + (float_type)b;
store(dst_vec, dst, index); store(convert_T(dst_vec), dst, index);
} }
...@@ -40,7 +40,9 @@ ...@@ -40,7 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void permute(const int nthreads, __kernel void permute(const int nthreads,
__global Dtype* bottom_data, __global Dtype* bottom_data,
......
...@@ -39,17 +39,18 @@ ...@@ -39,17 +39,18 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void prior_box(const int nthreads, __kernel void prior_box(const int nthreads,
const Dtype stepX, const float stepX,
const Dtype stepY, const float stepY,
__global const Dtype* _offsetsX, __global const float* _offsetsX,
__global const Dtype* _offsetsY, __global const float* _offsetsY,
const int offsetsX_size, const int offsetsX_size,
__global const Dtype* _widths, __global const float* _widths,
__global const Dtype* _heights, __global const float* _heights,
const int widths_size, const int widths_size,
__global Dtype* dst, __global Dtype* dst,
const int _layerHeight, const int _layerHeight,
...@@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads, ...@@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,
outputPtr = dst + index * 4 * offsetsX_size * widths_size; outputPtr = dst + index * 4 * offsetsX_size * widths_size;
Dtype _boxWidth, _boxHeight; float _boxWidth, _boxHeight;
Dtype4 vec; Dtype4 vec;
for (int i = 0; i < widths_size; ++i) for (int i = 0; i < widths_size; ++i)
{ {
...@@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads, ...@@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
_boxHeight = _heights[i]; _boxHeight = _heights[i];
for (int j = 0; j < offsetsX_size; ++j) for (int j = 0; j < offsetsX_size; ++j)
{ {
float center_x = (w + _offsetsX[j]) * stepX; Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
float center_y = (h + _offsetsY[j]) * stepY; Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
...@@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads, ...@@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
__kernel void set_variance(const int nthreads, __kernel void set_variance(const int nthreads,
const int offset, const int offset,
const int variance_size, const int variance_size,
__global const Dtype* variance, __global const float* variance,
__global Dtype* dst) __global Dtype* dst)
{ {
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
...@@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads, ...@@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
if (variance_size == 1) if (variance_size == 1)
var_vec = (Dtype4)(variance[0]); var_vec = (Dtype4)(variance[0]);
else else
var_vec = vload4(0, variance); var_vec = convert_T(vload4(0, variance));
vstore4(var_vec, 0, dst + offset + index * 4); vstore4(var_vec, 0, dst + offset + index * 4);
} }
......
...@@ -39,6 +39,10 @@ ...@@ -39,6 +39,10 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void reorg(const int count, __kernel void reorg(const int count,
__global const Dtype* src, __global const Dtype* src,
const int channels, const int channels,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment