Commit 3dd91688 authored by Li Peng's avatar Li Peng

fp16 ocl support for googlenet

Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent 329abb5b
......@@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
cc.chptrs.resize(nchannels*batchsz);
......@@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
......@@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0;
UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty())
return false;
......@@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());
......@@ -288,7 +288,7 @@ public:
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL)
if (IS_DNN_OPENCL_TARGET(preferableTarget))
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
......@@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
......@@ -860,6 +861,7 @@ public:
config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
}
......@@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......@@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs;
std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
......@@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -176,7 +176,7 @@ public:
{
CV_TRACE_FUNCTION();
CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr))
......@@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m)
{
return String("-DT=") + ocl::typeToStr(m.type()) + String(" ");
String str_name = ocl::typeToStr(m.type());
if (str_name == "short")
str_name = "half";
return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
}
#endif
......@@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
}
#endif
......@@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
}
#endif
......@@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
// TODO: implement OCL version
return false;
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
}
#endif
......
......@@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
if (inputs_.depth() == CV_16S && op != SUM)
return false;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
......@@ -284,10 +287,15 @@ public:
{
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
for (int i = 0; i < (inputs.size() - 1); ++i)
{
String buildopt = format("-DLOOP=%d", i);
String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat();
......@@ -306,6 +314,9 @@ public:
}
else
{
if (inputs_.depth() == CV_16S)
return false;
float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1;
......@@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif
FullyConnectedLayerImpl(const LayerParams& params)
......@@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
......@@ -293,6 +295,17 @@ public:
config.bias_term = bias;
config.M = outerSize;
config.K = innerSize;
config.use_half = use_half;
if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}
......@@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{
ret = false;
break;
}
if (bias && (outerSize > 1))
if (!use_half && bias && (outerSize > 1))
{
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
......@@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
......@@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2];
config.width = inputs[0].size[3];
config.norm_by_size = normBySize;
config.use_half = use_half;
lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
}
......@@ -146,7 +148,7 @@ public:
CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -102,6 +102,9 @@ public:
{
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");
int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
......@@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim);
MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1];
String buildopt = "-DNUM=4";
String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
......@@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{
bool ret = fast_forward_ocl(inputs, outputs);
return ret;
}
return fast_forward_ocl(inputs, outputs);
if (inputs[0].depth() == CV_16S)
return false;
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{
......@@ -195,7 +198,7 @@ public:
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number);
String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance)
{
String kname = format("calc_mean%d", number);
......@@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
......@@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
}
......@@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
......@@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0))
return false;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++)
{
......@@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2];
int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc);
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0;
......@@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -99,15 +99,16 @@ public:
softmaxOp.release();
}
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns)
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
itns.getUMatVector(internals);
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
if (softmaxOp.empty())
{
......@@ -117,6 +118,7 @@ public:
config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}
......@@ -128,15 +130,13 @@ public:
return true;
UMat& bufMat = internals[0];
src.copyTo(dstMat);
int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1);
String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
......@@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total();
size_t totalSize = src.total();
// adjust local/global size
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };
// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
size_t internal_globalSize[1] = { bufSize };
size_t total_globalSize[1] = { totalSize };
kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false))
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, NULL, false))
return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!ksub.run(1, total_globalSize, total_localSize, false))
ocl::KernelArg::PtrReadOnly(bufMat),
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false;
cv::exp(dstMat, dstMat);
ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false))
if (!ksum.run(1, internal_globalSize, NULL, false))
return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false))
if (!kdiv.run(1, total_globalSize, NULL, false))
return false;
return true;
......@@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
......
......@@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1),
dilation(1, 1),
group(1),
bias_term(false)
bias_term(false),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;
......@@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation;
int group; // = 1;
bool bias_term; // = false;
bool use_half; // = false;
};
typedef enum {
......@@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_;
int32_t bottom_index_;
......@@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_;
float power_;
bool fused_eltwise_;
bool use_half_;
};
typedef enum {
......@@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false),
avePoolPaddedArea(false)
avePoolPaddedArea(true),
use_half(false)
{}
MatShape in_shape;
MatShape out_shape;
......@@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false;
bool avePoolPaddedArea;
bool use_half;
};
template<typename Dtype>
......@@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_;
int32_t pooled_width_;
bool avePoolPaddedArea;
bool use_half;
};
struct OCL4DNNInnerProductConfig
{
OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true)
bias_term(false), transpose(false), phase_test(true), use_half(false)
{}
int num_output;
int M;
......@@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term;
bool transpose; // = false;
bool phase_test; // = true;
bool use_half; // = false;
};
template<typename Dtype>
......@@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_;
bool image_copied_;
bool phase_test_;
bool use_half_;
};
typedef enum {
......@@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0)
batch_size(0), channels(0), height(0), width(0), use_half(false)
{}
MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
......@@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels;
int32_t height;
int32_t width;
bool use_half;
};
template<typename Dtype>
......@@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_;
int32_t width_;
bool norm_by_size_;
bool use_half_;
};
struct OCL4DNNSoftmaxConfig
{
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false)
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{}
MatShape in_shape;
int axis;
int channels;
bool logsoftmax;
bool use_half;
};
template<typename Dtype>
......@@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_;
bool log_softmax_;
UMat scale_data_;
bool use_half_;
};
}}} // namespace cv::dnn::ocl4dnn
......
......@@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K;
phase_test_ = config.phase_test;
image_copied_ = false;
use_half_ = config.use_half;
}
template<typename Dtype>
......@@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size &&
N_ <= max_image_size &&
K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport())
{
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top,
max_image_size);
}
if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;
convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}
return ret;
}
}
......
......@@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels;
height_ = config.height;
width_ = config.width;
use_half_ = config.use_half;
}
template<typename Dtype>
......@@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false;
oclk_lrn_fill.set(argIdx++, n_threads);
......
......@@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels;
pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;
for (int i = 0; i < spatial_dims; ++i)
{
......@@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX:
{
bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
......@@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
......@@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{
CV_Assert(top_mask.empty());
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d",
......
......@@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis;
channels_ = config.channels;
log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;
inner_num_ = 1;
outer_num_ = 1;
......@@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm");
kname = "softmax_forward_slm";
else
kname = CL_KERNEL_SELECT("softmax_forward");
kname = "softmax_forward";
kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false;
......
......@@ -40,9 +40,17 @@
//
//M*/
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE
, T negative_slope
, KERNEL_ARG_DTYPE negative_slope
#endif
) {
int index = get_global_id(0);
......@@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
}
__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue)
const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{
int index = get_global_id(0);
if(index < count)
{
T x = in[index];
out[index] = clamp(x, minValue, maxValue);
out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
}
}
__kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data)
__global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{
int index = get_global_id(0);
int c = (index / plane_size) % channels;
......@@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]);
}
__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) {
__kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0);
if (index < n)
out[index] = pow(shift + scale * in[index], power);
}
__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
int index = get_global_id(0);
if (index < n)
{
T src = in[index];
out[index] = (src >= 0.f) ? src : exp(src) - 1;
}
}
......@@ -39,22 +39,29 @@
//
//M*/
__kernel void concat(const int nthreads,
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
__global const Dtype* in_data,
const int num_concats,
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data) {
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
__global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
const int top_index = concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
}
......@@ -40,27 +40,29 @@
//
//M*/
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2
#if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope)))
#define FUSED_ARG Dtype negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope,
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
#define FUSED_ARG Dtype power,
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG
#elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value,
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG
......@@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG
#endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
......@@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
......@@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif
#endif
#ifdef KERNEL_BASIC
......@@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
// Emulated half-precision "vector" structs for element counts that have no
// native OpenCL halfN vector type (5, 6, 7, 9..15, plus 0/1 edge cases).
// These mirror the floatN struct emulations defined above so the interleaved
// convolution kernels can be instantiated uniformly for TYPE_HALF.
// NOTE(review): requires cl_khr_fp16 to be enabled before this point — confirm
// the including kernel source emits the #pragma when building the half variant.
typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
#define OUT_PITCH_X output_width
#define ROW_PITCH input_width
......
......@@ -40,9 +40,9 @@
//
//M*/
#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B,
......@@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = a0 * coeff1 + b0 * coeff2;
dot1 = a1 * coeff1 + b1 * coeff2;
dot2 = a2 * coeff1 + b2 * coeff2;
dot3 = a3 * coeff1 + b3 * coeff2;
dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else
a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size);
dot0 = a0 + b0 * coeff2;
dot1 = a1 + b1 * coeff2;
dot2 = a2 + b2 * coeff2;
dot3 = a3 + b3 * coeff2;
dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * (Dtype4)coeff2;
#endif
vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
......
This diff is collapsed.
This diff is collapsed.
......@@ -40,16 +40,20 @@
//
//M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x,
__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y,
const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index];
Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst;
y[offy + index] = convert_Dtype(alpha) * src + dst;
}
}
......@@ -39,41 +39,45 @@
//
//M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float4 * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype4* result,
int offr,
__local float4 * work)
__local Dtype4* work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr);
float4 dot0 = (float4)(0.f);
float4 dot1 = (float4)(0.f);
float4 dot2 = (float4)(0.f);
float4 dot3 = (float4)(0.f);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global Dtype *src1_read = v + offv;
result = (__global Dtype4*)((__global Dtype*)result + offr);
Dtype4 dot0 = (Dtype4)(0.f);
Dtype4 dot1 = (Dtype4)(0.f);
Dtype4 dot2 = (Dtype4)(0.f);
Dtype4 dot3 = (Dtype4)(0.f);
unsigned int i = lid;
while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const float4 b0 = vload4(i, src1_read);
const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0;
dot1 += a1 * b0;
......@@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size];
const Dtype at0 = src0_trail[i];
const Dtype at1 = src0_trail[i + A_col_size];
const Dtype at2 = src0_trail[i + 2 * A_col_size];
const Dtype at3 = src0_trail[i + 3 * A_col_size];
const float bt = src1_trail[i];
const Dtype bt = src1_trail[i];
work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt;
......@@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
}
if(lid == 0) {
if(beta == (Dtype)0)
result[row_gid] = alpha * work[0];
result[row_gid] = convert_Dtype(alpha) * work[0];
else
result[row_gid] = alpha * work[0] + beta * result[row_gid];
result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
}
}
/* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A,
__global const Dtype * A,
int offA,
unsigned int A_col_size,
unsigned int row_offset,
unsigned int trail_item,
__global const float * v,
__global const Dtype * v,
int offv,
float alpha,
float beta,
__global float * result,
KERNEL_ARG_DTYPE alpha,
KERNEL_ARG_DTYPE beta,
__global Dtype * result,
int offr,
__local float * work)
__local Dtype * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv;
const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global Dtype *src1_read = v + + offv;
result = result + offr;
float4 dot0 = (float4)(0.f);
Dtype4 dot0 = (Dtype4)(0.f);
unsigned int i = lid;
while( i < A_col_size / 4)
{
const float4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read);
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0;
i += get_local_size(0);
......@@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{
if(trail_item != 0)
{
const __global float *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4;
const __global Dtype *src0_trail = src0_read + i * 4;
const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i];
const float bt = src1_trail[i];
const Dtype at0 = src0_trail[i];
const Dtype bt = src1_trail[i];
work[lid] += at0 * bt;
}
......@@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
if(lid == 0) {
if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0];
result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else {
result[row_gid+row_offset] *= beta;
result[row_gid+row_offset] += alpha * work[0];
result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
}
}
}
......@@ -40,6 +40,10 @@
//
//M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
......@@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index);
}
__kernel void MEAN_FUSE(__global const Dtype * A,
__kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size,
float alpha,
__global Dtype4 * result,
__global Dtype * B,
__global T4 * mean,
__global Dtype * tmp,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
......@@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 += a0;
dot1 += a1;
dot2 += a2;
dot3 += a3;
dot0 += convert_float4(a0);
dot1 += convert_float4(a1);
dot2 += convert_float4(a2);
dot3 += convert_float4(a3);
i += get_local_size(0);
}
......@@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
if(lid == 0)
{
result[row_gid] = alpha * work[0];
mean[row_gid] = convert_T(alpha * work[0]);
}
Dtype4 sum = work[0] * alpha;
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const T4 a0 = vload4(i, src0_read);
const T4 a1 = vload4(i, src0_read + A_col_size);
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = native_powr(a0 - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2);
dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);
vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
......@@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
}
__kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A,
__global const Dtype4 * mean,
__global const T * A,
__global const T4 * mean,
unsigned int A_col_size,
const float alpha_val,
const float eps,
const float relu_slope,
__global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias,
__global Dtype * B,
__global T * B,
__local Dtype4 * work)
{
unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
......@@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
}
barrier(CLK_LOCAL_MEM_FENCE);
Dtype4 mean_val = mean[row_gid];
Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val;
......@@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid;
while( i < A_col_size / 4)
{
const Dtype4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size);
const T4 a0 = vload4(i, src1_read);
const T4 a1 = vload4(i, src1_read + A_col_size);
const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w;
dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y;
......@@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif
vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size);
vstore4(convert_T(dot0), i, dst0_read);
vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
i += get_local_size(0);
}
......
......@@ -42,14 +42,18 @@
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels,
const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k,
const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out,
const Dtype negative_beta) {
const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) {
// find out the local offset
......@@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width;
__global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset;
Dtype scale_val;
KERNEL_ARG_DTYPE scale_val;
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
......@@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
// subtract only
......@@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step];
}
scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head;
}
}
......
......@@ -42,7 +42,10 @@
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#if defined KERNEL_MAX_POOL
......
......@@ -40,9 +40,9 @@
//
//M*/
#define Dtype float
#define Dtype4 float4
#define Dtype8 float8
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void slice(__global const Dtype* src,
const int src_plane_size,
......
......@@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0);
......@@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,
__kernel void kernel_channel_subtract(const int count,
const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) {
const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0);
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s];
data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
}
}
......
......@@ -42,12 +42,15 @@
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim,
__global Dtype* scale,
......@@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
......@@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000;
scale_tmp[s] = maxval;
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000;
scale_tmp[s] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) {
float maxval = -FLT_MAX;
Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval);
}
maxval = sub_group_reduce_max(maxval * 100000);
maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
}
......@@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000;
scale[n * spatial_dim + s] = maxval;
}
barrier(CLK_GLOBAL_MEM_FENCE);
......@@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s];
}
sum = sub_group_reduce_add(sum * 100000);
sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
......@@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000;
scale[n * spatial_dim + s] = sum;
}
barrier(CLK_GLOBAL_MEM_FENCE);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment