Commit 3dd91688 authored by Li Peng's avatar Li Peng

fp16 ocl support for googlenet

Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent 329abb5b
...@@ -128,14 +128,14 @@ public: ...@@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ ) for( i = 0; i < ninputs; i++ )
{ {
Mat& inp = *inputs[i]; Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F && CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] && inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] && inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] ); inp.size[3] == output.size[3] );
nchannels += inp.size[1]; nchannels += inp.size[1];
} }
CV_Assert( nchannels == output.size[1] ); CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F ); CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
cc.chptrs.resize(nchannels*batchsz); cc.chptrs.resize(nchannels*batchsz);
...@@ -186,6 +186,7 @@ public: ...@@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -199,11 +200,12 @@ public: ...@@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis); int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0; int offset_concat_axis = 0;
UMat& outMat = outputs[0]; UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
...@@ -235,7 +237,7 @@ public: ...@@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -94,7 +94,7 @@ public: ...@@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
const Mat &input = *inputs[0]; const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F)); CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
CV_Assert(inputs[i]->type() == input.type()); CV_Assert(inputs[i]->type() == input.type());
...@@ -288,7 +288,7 @@ public: ...@@ -288,7 +288,7 @@ public:
newActiv = true; newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL) if (IS_DNN_OPENCL_TARGET(preferableTarget))
{ {
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>(); Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty()) if (!activ_power.empty())
...@@ -842,6 +842,7 @@ public: ...@@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -860,6 +861,7 @@ public: ...@@ -860,6 +861,7 @@ public:
config.dilation = dilation; config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1]; config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false; config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config)); convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
} }
...@@ -964,7 +966,7 @@ public: ...@@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
...@@ -1360,6 +1362,9 @@ public: ...@@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
...@@ -1450,7 +1455,7 @@ public: ...@@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -176,7 +176,7 @@ public: ...@@ -176,7 +176,7 @@ public:
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr)) func.applyOCL(inputs_arr, outputs_arr, internals_arr))
...@@ -223,7 +223,12 @@ public: ...@@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m) static String oclGetTMacro(const UMat &m)
{ {
return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); String str_name = ocl::typeToStr(m.type());
if (str_name == "short")
str_name = "half";
return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
} }
#endif #endif
...@@ -516,8 +521,28 @@ struct SigmoidFunctor ...@@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
...@@ -561,8 +586,28 @@ struct ELUFunctor ...@@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
...@@ -604,8 +649,28 @@ struct AbsValFunctor ...@@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
......
...@@ -271,6 +271,9 @@ public: ...@@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
if (inputs_.depth() == CV_16S && op != SUM)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
...@@ -284,10 +287,15 @@ public: ...@@ -284,10 +287,15 @@ public:
{ {
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] }; size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
for (int i = 0; i < (inputs.size() - 1); ++i) for (int i = 0; i < (inputs.size() - 1); ++i)
{ {
String buildopt = format("-DLOOP=%d", i); String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt); ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0; int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat(); UMat inpMat = (i == 0) ? inputs[0] : UMat();
...@@ -306,6 +314,9 @@ public: ...@@ -306,6 +314,9 @@ public:
} }
else else
{ {
if (inputs_.depth() == CV_16S)
return false;
float coeff1 = coeffs.empty() ? 1.f : coeffs[0]; float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1]; float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1; UMat mul0, mul1;
...@@ -343,7 +354,7 @@ public: ...@@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -140,7 +140,7 @@ public: ...@@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() && outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -64,6 +64,7 @@ public: ...@@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp; Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs; std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif #endif
FullyConnectedLayerImpl(const LayerParams& params) FullyConnectedLayerImpl(const LayerParams& params)
...@@ -277,6 +278,7 @@ public: ...@@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -293,6 +295,17 @@ public: ...@@ -293,6 +295,17 @@ public:
config.bias_term = bias; config.bias_term = bias;
config.M = outerSize; config.M = outerSize;
config.K = innerSize; config.K = innerSize;
config.use_half = use_half;
if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config)); innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
} }
...@@ -309,13 +322,15 @@ public: ...@@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]); dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f); dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{ {
ret = false; ret = false;
break; break;
} }
if (bias && (outerSize > 1)) if (!use_half && bias && (outerSize > 1))
{ {
UMat& biases = umat_blobs[1]; UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
...@@ -353,7 +368,7 @@ public: ...@@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -106,6 +106,7 @@ public: ...@@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -128,6 +129,7 @@ public: ...@@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2]; config.height = inputs[0].size[2];
config.width = inputs[0].size[3]; config.width = inputs[0].size[3];
config.norm_by_size = normBySize; config.norm_by_size = normBySize;
config.use_half = use_half;
lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config)); lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
} }
...@@ -146,7 +148,7 @@ public: ...@@ -146,7 +148,7 @@ public:
CV_Assert(inputs_arr.total() == outputs_arr.total()); CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -102,6 +102,9 @@ public: ...@@ -102,6 +102,9 @@ public:
{ {
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");
int splitDim = (acrossChannels) ? 1 : 2; int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
...@@ -111,12 +114,11 @@ public: ...@@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim); int newRows = total(shape(inpMat), 0, splitDim);
MatShape s = shape(newRows, inpMat.total() / newRows); MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F); UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F); UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1]; float alpha = 1.0f / s[1];
String buildopt = "-DNUM=4"; String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt); ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] }; size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
...@@ -167,13 +169,14 @@ public: ...@@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim); int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim); int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0)) if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{ return fast_forward_ocl(inputs, outputs);
bool ret = fast_forward_ocl(inputs, outputs);
return ret; if (inputs[0].depth() == CV_16S)
} return false;
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{ {
...@@ -195,7 +198,7 @@ public: ...@@ -195,7 +198,7 @@ public:
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) }; size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance) if (normVariance)
{ {
String kname = format("calc_mean%d", number); String kname = format("calc_mean%d", number);
...@@ -249,7 +252,7 @@ public: ...@@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -147,6 +147,7 @@ public: ...@@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
...@@ -164,6 +165,7 @@ public: ...@@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE : (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO); LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea; config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config)); poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
} }
...@@ -189,7 +191,7 @@ public: ...@@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -181,6 +181,7 @@ public: ...@@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
...@@ -188,6 +189,11 @@ public: ...@@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0)) (total(shape(outputs[0]), 2) % 4 != 0))
return false; return false;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0]; const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++) for (size_t i = 0; i < outputs.size(); i++)
{ {
...@@ -196,7 +202,7 @@ public: ...@@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2]; int rows = outputs[i].size[2];
int cols = outputs[i].size[3]; int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc); ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 }; size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] }; size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0; int idx = 0;
...@@ -222,7 +228,7 @@ public: ...@@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -99,15 +99,16 @@ public: ...@@ -99,15 +99,16 @@ public:
softmaxOp.release(); softmaxOp.release();
} }
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns) bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{ {
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
inps.getUMatVector(inputs); bool use_half = (inputs_.depth() == CV_16S);
outs.getUMatVector(outputs); inputs_.getUMatVector(inputs);
itns.getUMatVector(internals); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
if (softmaxOp.empty()) if (softmaxOp.empty())
{ {
...@@ -117,6 +118,7 @@ public: ...@@ -117,6 +118,7 @@ public:
config.axis = axisRaw; config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw]; config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax; config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config)); softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
} }
...@@ -128,15 +130,13 @@ public: ...@@ -128,15 +130,13 @@ public:
return true; return true;
UMat& bufMat = internals[0]; UMat& bufMat = internals[0];
src.copyTo(dstMat);
int axis = clamp(axisRaw, src.dims); int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src); MatShape s = shape(src);
size_t outerSize = total(s, 0, axis); size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis]; size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1); size_t innerSize = total(s, axis + 1);
String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv; ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
...@@ -152,38 +152,31 @@ public: ...@@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false; return false;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total(); size_t bufSize = internals[0].total();
size_t totalSize = src.total(); size_t totalSize = src.total();
// adjust local/global size size_t internal_globalSize[1] = { bufSize };
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize }; size_t total_globalSize[1] = { totalSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };
// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
kmax.args((int)outerSize, (int)channels, (int)innerSize, kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false)) if (!kmax.run(1, internal_globalSize, NULL, false))
return false; return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat),
if (!ksub.run(1, total_globalSize, total_localSize, false)) ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false; return false;
cv::exp(dstMat, dstMat);
ksum.args((int)outerSize, (int)channels, (int)innerSize, ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false)) if (!ksum.run(1, internal_globalSize, NULL, false))
return false; return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false)) if (!kdiv.run(1, total_globalSize, NULL, false))
return false; return false;
return true; return true;
...@@ -195,7 +188,7 @@ public: ...@@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
...@@ -59,7 +59,8 @@ struct OCL4DNNConvConfig ...@@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1), stride(1, 1),
dilation(1, 1), dilation(1, 1),
group(1), group(1),
bias_term(false) bias_term(false),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
...@@ -69,6 +70,7 @@ struct OCL4DNNConvConfig ...@@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation; Size dilation;
int group; // = 1; int group; // = 1;
bool bias_term; // = false; bool bias_term; // = false;
bool use_half; // = false;
}; };
typedef enum { typedef enum {
...@@ -272,6 +274,8 @@ class OCL4DNNConvSpatial ...@@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;
UMat swizzled_weights_umat; UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_; UMat bottom_data2_;
int32_t bottom_index_; int32_t bottom_index_;
...@@ -327,6 +331,7 @@ class OCL4DNNConvSpatial ...@@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_; ocl4dnnFusedActiv_t fused_activ_;
float power_; float power_;
bool fused_eltwise_; bool fused_eltwise_;
bool use_half_;
}; };
typedef enum { typedef enum {
...@@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig ...@@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0), channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX), pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false), global_pooling(false),
avePoolPaddedArea(false) avePoolPaddedArea(true),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
...@@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig ...@@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false; bool global_pooling; // = false;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
...@@ -391,13 +398,14 @@ class OCL4DNNPool ...@@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_; int32_t pooled_height_;
int32_t pooled_width_; int32_t pooled_width_;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
struct OCL4DNNInnerProductConfig struct OCL4DNNInnerProductConfig
{ {
OCL4DNNInnerProductConfig() : OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0), num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true) bias_term(false), transpose(false), phase_test(true), use_half(false)
{} {}
int num_output; int num_output;
int M; int M;
...@@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig ...@@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term; bool bias_term;
bool transpose; // = false; bool transpose; // = false;
bool phase_test; // = true; bool phase_test; // = true;
bool use_half; // = false;
}; };
template<typename Dtype> template<typename Dtype>
...@@ -428,6 +437,7 @@ class OCL4DNNInnerProduct ...@@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_; bool transpose_;
bool image_copied_; bool image_copied_;
bool phase_test_; bool phase_test_;
bool use_half_;
}; };
typedef enum { typedef enum {
...@@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig ...@@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS), lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true), phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false), local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0) batch_size(0), channels(0), height(0), width(0), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
...@@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig ...@@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels; int32_t channels;
int32_t height; int32_t height;
int32_t width; int32_t width;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
...@@ -477,16 +488,18 @@ class OCL4DNNLRN ...@@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_; int32_t height_;
int32_t width_; int32_t width_;
bool norm_by_size_; bool norm_by_size_;
bool use_half_;
}; };
struct OCL4DNNSoftmaxConfig struct OCL4DNNSoftmaxConfig
{ {
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false) OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
int axis; int axis;
int channels; int channels;
bool logsoftmax; bool logsoftmax;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
...@@ -506,6 +519,7 @@ class OCL4DNNSoftmax ...@@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_; bool use_slm_;
bool log_softmax_; bool log_softmax_;
UMat scale_data_; UMat scale_data_;
bool use_half_;
}; };
}}} // namespace cv::dnn::ocl4dnn }}} // namespace cv::dnn::ocl4dnn
......
...@@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config ...@@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K; K_ = config.K;
phase_test_ = config.phase_test; phase_test_ = config.phase_test;
image_copied_ = false; image_copied_ = false;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
...@@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom, ...@@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size && if (M_ <= max_image_size &&
N_ <= max_image_size && N_ <= max_image_size &&
K_ <= max_image_size && K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport()) ocl::Device::getDefault().intelSubgroupsSupport())
{ {
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans, ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top, M_, N_, K_, bottom, weight, UMat(), top,
max_image_size); max_image_size);
} }
if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;
convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}
return ret; return ret;
} }
} }
......
...@@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config) ...@@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels; channels_ = config.channels;
height_ = config.height; height_ = config.height;
width_ = config.width; width_ = config.width;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
...@@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top) ...@@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_; int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads}; size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : ""; String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill; ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts)) String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false; return false;
oclk_lrn_fill.set(argIdx++, n_threads); oclk_lrn_fill.set(argIdx++, n_threads);
......
...@@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config) ...@@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels; channels_ = config.channels;
pool_method_ = config.pool_method; pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea; avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;
for (int i = 0; i < spatial_dims; ++i) for (int i = 0; i < spatial_dims; ++i)
{ {
...@@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom, ...@@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX: case LIBDNN_POOLING_METHOD_MAX:
{ {
bool haveMask = !top_mask.empty(); bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward( ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"), kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
...@@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom, ...@@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
...@@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom, ...@@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d", " -D STRIDE_W=%d -D STRIDE_H=%d",
......
...@@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config) ...@@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis; softmax_axis_ = config.axis;
channels_ = config.channels; channels_ = config.channels;
log_softmax_ = config.logsoftmax; log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;
inner_num_ = 1; inner_num_ = 1;
outer_num_ = 1; outer_num_ = 1;
...@@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top) ...@@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
if (log_softmax_) opts += " -DLOG_SOFTMAX "; if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_) if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm"); kname = "softmax_forward_slm";
else else
kname = CL_KERNEL_SELECT("softmax_forward"); kname = "softmax_forward";
kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false; return false;
......
...@@ -40,9 +40,17 @@ ...@@ -40,9 +40,17 @@
// //
//M*/ //M*/
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void ReLUForward(const int count, __global const T* in, __global T* out __kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE #ifndef RELU_NO_SLOPE
, T negative_slope , KERNEL_ARG_DTYPE negative_slope
#endif #endif
) { ) {
int index = get_global_id(0); int index = get_global_id(0);
...@@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out ...@@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
} }
__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out, __kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue) const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{ {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) if(index < count)
{ {
T x = in[index]; T x = in[index];
out[index] = clamp(x, minValue, maxValue); out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
} }
} }
__kernel void PReLUForward(const int count, const int channels, const int plane_size, __kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data) __global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{ {
int index = get_global_id(0); int index = get_global_id(0);
int c = (index / plane_size) % channels; int c = (index / plane_size) % channels;
...@@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out) ...@@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]); out[index] = fabs(in[index]);
} }
__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) { __kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0); int index = get_global_id(0);
if (index < n) if (index < n)
out[index] = pow(shift + scale * in[index], power); out[index] = pow(shift + scale * in[index], power);
} }
__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
int index = get_global_id(0);
if (index < n)
{
T src = in[index];
out[index] = (src >= 0.f) ? src : exp(src) - 1;
}
}
...@@ -39,22 +39,29 @@ ...@@ -39,22 +39,29 @@
// //
//M*/ //M*/
__kernel void concat(const int nthreads, #if defined(cl_khr_fp16)
__global const Dtype* in_data, #pragma OPENCL EXTENSION cl_khr_fp16 : enable
const int num_concats, #endif
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data) {
for (int index = get_global_id(0); index < nthreads; #define CONCAT(A,B) A##_##B
index += get_global_size(0)) { #define TEMPLATE(name,type) CONCAT(name,type)
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size; __kernel void TEMPLATE(concat, Dtype)(const int nthreads,
const int concat_index = index % total_concat_size; __global const Dtype* in_data,
const int top_index = concat_index const int num_concats,
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; const int concat_size,
out_data[top_index] = in_data[index]; const int top_concat_axis,
} const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
} }
...@@ -40,27 +40,29 @@ ...@@ -40,27 +40,29 @@
// //
//M*/ //M*/
#if APPLY_BIAS #if defined(cl_khr_fp16)
#define BIAS_KERNEL_ARG __global Dtype * biases_base, #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#else
#define BIAS_KERNEL_ARG
#endif #endif
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2
#if defined(FUSED_CONV_RELU) #if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG Dtype negative_slope, #define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU) #elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c]))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope, #define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER) #elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power) #define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG Dtype power, #define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH) #elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x) #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG #define FUSED_ARG
#elif defined(FUSED_CONV_RELU6) #elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value)) #define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value, #define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else #else
#define ACTIVATION_RELU_FUNCTION(x, c) (x) #define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG #define FUSED_ARG
...@@ -74,6 +76,11 @@ ...@@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG #define ELTWISE_DATA_ARG
#endif #endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y) #define CAT(x, y) __CAT(x, y)
...@@ -97,6 +104,16 @@ ...@@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) #define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#if defined(convolve_simd) || defined(Conv_Interleaved) #if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint #define INT_TYPE uint
#define INT_TYPE2 uint2 #define INT_TYPE2 uint2
#define INT_TYPE4 uint4 #define INT_TYPE4 uint4
...@@ -106,6 +123,7 @@ ...@@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 #define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read #define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif #endif
#endif
#ifdef KERNEL_BASIC #ifdef KERNEL_BASIC
...@@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float ...@@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy. typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
#define OUT_PITCH_X output_width #define OUT_PITCH_X output_width
#define ROW_PITCH input_width #define ROW_PITCH input_width
......
...@@ -40,9 +40,9 @@ ...@@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void op_sum4(__global const Dtype * A, __kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B, __global const Dtype * B,
...@@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A, ...@@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size); a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size); a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = a0 * coeff1 + b0 * coeff2; dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * coeff1 + b1 * coeff2; dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * coeff1 + b2 * coeff2; dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * coeff1 + b3 * coeff2; dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else #else
a0 = vload4(i, dst0_read); a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size); a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size); a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size); a3 = vload4(i, dst0_read + 3 * A_col_size);
dot0 = a0 + b0 * coeff2; dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * coeff2; dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * coeff2; dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * coeff2; dot3 = a3 + b3 * (Dtype4)coeff2;
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);
......
This diff is collapsed.
This diff is collapsed.
...@@ -40,16 +40,20 @@ ...@@ -40,16 +40,20 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, __kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y, const int offx, __global Dtype* y,
const int offy) { const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) { for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index]; Dtype src = x[offx + index];
Dtype dst = y[offy + index]; Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst; y[offy + index] = convert_Dtype(alpha) * src + dst;
} }
} }
...@@ -39,41 +39,45 @@ ...@@ -39,41 +39,45 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(matvec_mul4,Dtype)( __kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float4 * result, __global Dtype4* result,
int offr, int offr,
__local float4 * work) __local Dtype4* work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv; const __global Dtype *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr); result = (__global Dtype4*)((__global Dtype*)result + offr);
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
float4 dot1 = (float4)(0.f); Dtype4 dot1 = (Dtype4)(0.f);
float4 dot2 = (float4)(0.f); Dtype4 dot2 = (Dtype4)(0.f);
float4 dot3 = (float4)(0.f); Dtype4 dot3 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) { while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size); const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size); const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size); const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
dot1 += a1 * b0; dot1 += a1 * b0;
...@@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)( ...@@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size]; const Dtype at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size]; const Dtype at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size]; const Dtype at3 = src0_trail[i + 3 * A_col_size];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid].s0 += at0 * bt; work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt; work[lid].s1 += at1 * bt;
...@@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)( ...@@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
} }
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) if(beta == (Dtype)0)
result[row_gid] = alpha * work[0]; result[row_gid] = convert_Dtype(alpha) * work[0];
else else
result[row_gid] = alpha * work[0] + beta * result[row_gid]; result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
} }
} }
/* This kernel used for the trailing rows when row_of_A %4 !=0 */ /* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)( __kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int row_offset, unsigned int row_offset,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float * result, __global Dtype * result,
int offr, int offr,
__local float * work) __local Dtype * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv; const __global Dtype *src1_read = v + + offv;
result = result + offr; result = result + offr;
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
i += get_local_size(0); i += get_local_size(0);
...@@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)( ...@@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid] += at0 * bt; work[lid] += at0 * bt;
} }
...@@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)( ...@@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) { if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0]; result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else { } else {
result[row_gid+row_offset] *= beta; result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += alpha * work[0]; result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
} }
} }
} }
...@@ -40,7 +40,11 @@ ...@@ -40,7 +40,11 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define Dtype float
#define Dtype4 float4 #define Dtype4 float4
#define Dtype8 float8 #define Dtype8 float8
...@@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src, ...@@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index); store(dst_vec, dst, index);
} }
__kernel void MEAN_FUSE(__global const Dtype * A, __kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size, unsigned int A_col_size,
float alpha, float alpha,
__global Dtype4 * result, __global T4 * mean,
__global Dtype * B, __global Dtype * tmp,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size; const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
...@@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A, ...@@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f; const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 += a0; dot0 += convert_float4(a0);
dot1 += a1; dot1 += convert_float4(a1);
dot2 += a2; dot2 += convert_float4(a2);
dot3 += a3; dot3 += convert_float4(a3);
i += get_local_size(0); i += get_local_size(0);
} }
...@@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A, ...@@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
if(lid == 0) if(lid == 0)
{ {
result[row_gid] = alpha * work[0]; mean[row_gid] = convert_T(alpha * work[0]);
} }
Dtype4 sum = work[0] * alpha; Dtype4 sum = work[0] * alpha;
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = native_powr(a0 - (Dtype4)sum.x, 2); dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2); dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2); dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2); dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);
...@@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A, ...@@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
} }
__kernel void MVN_FUSE(__global const Dtype * tmp, __kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A, __global const T * A,
__global const Dtype4 * mean, __global const T4 * mean,
unsigned int A_col_size, unsigned int A_col_size,
const float alpha_val, const float alpha_val,
const float eps, const float eps,
const float relu_slope, const float relu_slope,
__global const Dtype4 * bnorm_weight, __global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias, __global const Dtype4 * bnorm_bias,
__global Dtype * B, __global T * B,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size; const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size; const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
...@@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, ...@@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
Dtype4 mean_val = mean[row_gid]; Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps; Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val; Dtype4 alpha = (Dtype4)1.f / dev_val;
...@@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, ...@@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src1_read); const T4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size); const T4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size); const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size); const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x; dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y; dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z; dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w; dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
dot0 = dot0 * w.x + (Dtype4)b.x; dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y; dot1 = dot1 * w.y + (Dtype4)b.y;
...@@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, ...@@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f); dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(convert_T(dot0), i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size); vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size); vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
i += get_local_size(0); i += get_local_size(0);
} }
......
...@@ -42,14 +42,18 @@ ...@@ -42,14 +42,18 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels, const int num, const int channels,
const int height, const int width, const int size, const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k, const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out, __global Dtype* const out,
const Dtype negative_beta) { const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) { index += get_global_size(0)) {
// find out the local offset // find out the local offset
...@@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con ...@@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width; const int step = height * width;
__global const Dtype* in_off = in + offset; __global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset; __global Dtype* out_off = out + offset;
Dtype scale_val; KERNEL_ARG_DTYPE scale_val;
int head = 0; int head = 0;
const int pre_pad = (size - 1) / 2; const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1; const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0; KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w] // fill the scale at [n, :, h, w]
// accumulate values // accumulate values
while (head < post_pad && head < channels) { while (head < post_pad && head < channels) {
...@@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con ...@@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
// subtract only // subtract only
...@@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con ...@@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
} }
......
...@@ -42,7 +42,10 @@ ...@@ -42,7 +42,10 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#if defined KERNEL_MAX_POOL #if defined KERNEL_MAX_POOL
......
...@@ -40,9 +40,9 @@ ...@@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void slice(__global const Dtype* src, __kernel void slice(__global const Dtype* src,
const int src_plane_size, const int src_plane_size,
......
...@@ -24,6 +24,10 @@ ...@@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void kernel_channel_max(const int num, const int channels, __kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) { const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0); int index = get_global_id(0);
...@@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels, ...@@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,
__kernel void kernel_channel_subtract(const int count, __kernel void kernel_channel_subtract(const int count,
const int num, const int channels, const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) { const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) { if(index < count) {
int n = index / channels / spatial_dim; int n = index / channels / spatial_dim;
int s = index % spatial_dim; int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s]; data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
} }
} }
......
...@@ -42,12 +42,15 @@ ...@@ -42,12 +42,15 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_intel_subgroups) #if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif #endif
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim, const int spatial_dim,
__global Dtype* scale, __global Dtype* scale,
...@@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann ...@@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1); int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
...@@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann ...@@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000; scale_tmp[s] = maxval;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann ...@@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s]; sum += out_tmp[c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann ...@@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000; scale_tmp[s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, ...@@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
...@@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, ...@@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000; scale[n * spatial_dim + s] = maxval;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
...@@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, ...@@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s]; sum += out[n * channels * spatial_dim + c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
...@@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, ...@@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000; scale[n * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment