Commit 3d44e9ad authored by Wu Zhiwen, committed by Alexander Alekhin

Merge pull request #13520 from wzw-intel:hang

* dnn/Vulkan: fix GPU hang for heavy convolution tasks

Intel i915 driver will declare GPU hang if the compute shader
takes too long to complete. See
https://bugs.freedesktop.org/show_bug.cgi?id=108947 for details.

The idea in this commit is to divide a heavy task into several lighter
ones and run the compute shader multiple times, so that each run
completes in a short enough time.

TODO: Add more efficient compute shader
Signed-off-by: Wu Zhiwen <zhiwen.wu@intel.com>

* dnn/Vulkan: add a more efficient conv shader
parent 73959fed
......@@ -1505,32 +1505,6 @@ struct Net::Impl
continue;
}
if (ld.type == "Convolution")
{
std::vector<MatShape> in_shapes;
std::vector<MatShape> out_shapes;
CV_Assert(ld.inputBlobs.size() == ld.outputBlobs.size());
for (int i = 0; i < ld.inputBlobs.size(); i++)
{
in_shapes.push_back(shape(*ld.inputBlobs[i]));
out_shapes.push_back(shape(ld.outputBlobs[i]));
}
int64 flops = layer->getFLOPS(in_shapes, out_shapes);
// FIXME
//
// This is a workaround for GPU hang on heavy convolution workload ( > 10 GFLOPS).
// For the long time task, vkWaitForFences() return without error but next call on
// vkQueueSubmit() return -4, i.e. "VK_ERROR_DEVICE_LOST" and driver reports GPU hang.
//
// Need more investigation on root cause of GPU hang and need to optimize convolution shader
// to reduce process time.
if (flops > CV_BIG_INT(10) * 1000 * 1000 * 1000)
{
continue;
}
}
ld.skip = false;
try
......
......@@ -31,7 +31,7 @@ protected:
void createDescriptorSetLayout(int buffer_num);
void createDescriptorSet(int buffer_num);
void createShaderModule(const uint32_t* spv, size_t sz, const std::string& source = std::string());
void createPipeline(size_t push_constants_size = 0);
void createPipeline(size_t push_constants_size = 0, VkSpecializationInfo* specialization_info = 0);
void createCommandBuffer();
void recordCommandBuffer(void* push_constants = NULL, size_t push_constants_size = 0);
void runCommandBuffer();
......
......@@ -18,7 +18,8 @@ namespace cv { namespace dnn { namespace vkcom {
enum ConvShaderType
{
kConvShaderTypeBasic = 0,
kConvShaderTypeIDLF = 1,
kConvShaderType48,
kConvShaderTypeDepthWise,
kConvShaderTypeNum
};
......
......@@ -32,15 +32,18 @@ layout(push_constant) uniform pushBlock {
int M;
int K;
int N;
int basic_shader_batch_idx;
int basic_shader_partition_idx;
int basic_shader_partition_size;
} p;
layout(local_size_x = LOCAL_SZ_X, local_size_y = 1, local_size_z = 1) in;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if(gx < p.M && gy < p.N && gz < p.batch)
int gy = int(gl_GlobalInvocationID.y) + p.basic_shader_partition_idx * p.basic_shader_partition_size;
int gz = p.basic_shader_batch_idx;
if(gx < p.M && gy < p.N)
{
float sum = 0.0f;
int output_y = gx / p.out_w;
......
#version 450
// Tiled convolution-as-GEMM compute shader ("conv48"): each invocation
// computes a BLOCK_H x BLOCK_W (4 rows x 8 columns) tile of the output
// matrix, i.e. 8 output channels by 4 spatial positions, using vec4 loads.
// Geometry is injected at pipeline-creation time via specialization
// constants (ids 0..19), matching the host-side ShaderConstant struct.
layout (constant_id = 0) const int LOCAL_SZ_X = 0;
layout (constant_id = 1) const int LOCAL_SZ_Y = 0;
layout (constant_id = 2) const int LOCAL_SZ_Z = 0;
layout (constant_id = 3) const int IN_H = 0;
layout (constant_id = 4) const int IN_W = 0;
layout (constant_id = 5) const int OUT_W = 0;
layout (constant_id = 6) const int STRIDE_H = 0;
layout (constant_id = 7) const int STRIDE_W = 0;
layout (constant_id = 8) const int PAD_H = 0;
layout (constant_id = 9) const int PAD_W = 0;
layout (constant_id = 10) const int FILTER_H = 0;
layout (constant_id = 11) const int FILTER_W = 0;
layout (constant_id = 12) const int CHANNELS = 0;
layout (constant_id = 13) const int BATCH = 0;
// M = out_h * out_w, K = filter_h * filter_w * channels, N = out_channels
// (set by the host; see OpConv::forward).
layout (constant_id = 14) const int M = 0;
layout (constant_id = 15) const int K = 0;
layout (constant_id = 16) const int N = 0;
// TAIL_M (= M % 4 on the host side) is declared but not referenced in
// this shader; the M-tail is presumably handled elsewhere — TODO confirm.
layout (constant_id = 17) const int TAIL_M = 0;
layout (constant_id = 18) const int DILATION_H = 0;
layout (constant_id = 19) const int DILATION_W = 0;
// Optional fused activation, selected by a compile-time define.
#if defined(ACTIVATION_RELU)
#define ACTIVATION_FUNCTION(x) clamp(x, vec4(0.0), vec4(999999999.0))
#elif defined(ACTIVATION_RELU1)
#define ACTIVATION_FUNCTION(x) clamp(x, vec4(-1.0), vec4(1.0))
#elif defined(ACTIVATION_RELU6)
#define ACTIVATION_FUNCTION(x) clamp(x, vec4(0.0), vec4(6.0))
#else
#define ACTIVATION_FUNCTION(x) (x)
#endif
// binding 0: input tensor, scalar floats (gathered element-wise by LOAD_A).
layout(binding = 0) readonly buffer Input0{
float data[];
} src0;
// binding 1: per-output-channel bias, packed as vec4 (2 vec4s per 8-channel tile).
layout(binding = 1) readonly buffer Input1 {
vec4 data[];
} bias;
// binding 2: filter weights viewed as an N x (K/4) matrix of vec4 rows.
layout(binding = 2) readonly buffer Input3{
vec4 data[];
} src1;
// binding 3: output, vec4-packed along the spatial (M) dimension.
layout(binding = 3) writeonly buffer Output{
vec4 data[];
} out0;
// Workgroup size comes from specialization constants 0..2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
#define VEC_SIZE 4
#define BLOCK_H 4
#define BLOCK_W 8
#define FILTER_AREA (FILTER_H * FILTER_W)
// LOAD_A: im2col gather of one scalar of the A (input) matrix.
// Decomposes linear K-index (i * VEC_SIZE + elm_idx) into filter x/y and
// input channel, then reads the input pixel. Out-of-bounds taps are left
// untouched, so the caller's zero-initialized vec4 provides zero padding.
#define LOAD_A(elm_idx, a_component) \
src0_x = org_x + ((i * VEC_SIZE + elm_idx) % FILTER_W) * DILATION_W; \
src0_y = org_y + (((i * VEC_SIZE + elm_idx) % FILTER_AREA) / FILTER_W) * DILATION_H; \
src0_z = (i * VEC_SIZE + elm_idx) / FILTER_AREA; \
if(src0_y >= 0 && src0_y < IN_H && src0_x >= 0 && src0_x < IN_W) \
{ \
a_component = src0.data[input_batch_offset + src0_z * (IN_H * IN_W) + src0_y * IN_W + src0_x]; \
}
// A_MULTIPLY_BTILE: for output row (out_y + sliver_num), map the spatial
// index to (dst_x, dst_y), derive the top-left input tap (org_x, org_y)
// from stride/pad, gather a vec4 of A, and accumulate its dot product
// with all 8 cached B rows into component `comp` of dot0..dot7.
#define A_MULTIPLY_BTILE(a, sliver_num, comp) \
dst_x = (out_y + sliver_num) % OUT_W; \
dst_y = (out_y + sliver_num) / OUT_W; \
org_y = dst_y * STRIDE_H - PAD_H; \
org_x = dst_x * STRIDE_W - PAD_W; \
LOAD_A(0, a.x); \
LOAD_A(1, a.y); \
LOAD_A(2, a.z); \
LOAD_A(3, a.w); \
dot0.comp += dot(brow0, a); \
dot1.comp += dot(brow1, a); \
dot2.comp += dot(brow2, a); \
dot3.comp += dot(brow3, a); \
dot4.comp += dot(brow4, a); \
dot5.comp += dot(brow5, a); \
dot6.comp += dot(brow6, a); \
dot7.comp += dot(brow7, a);
void main()
{
// gx indexes 8-channel output tiles along N; gy indexes 4-row tiles
// along M; gz is the batch index (host dispatches group_z = batch).
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
int out_x = BLOCK_W * gx;
int out_y = BLOCK_H * gy;
int input_batch_offset = gz * IN_H * IN_W * CHANNELS;
int output_batch_offset = gz * M * N / VEC_SIZE;
if (out_x < N && gy < M / BLOCK_H)
{
// width0/width1: row lengths of B and output in vec4 units.
int width0 = K / VEC_SIZE;
int width1 = N / VEC_SIZE;
// Start of the first of the 8 consecutive B rows for this tile.
int src1_read0_offset = out_x * width0;
// dotJ.{x,y,z,w} accumulates output channel (out_x + J) at spatial
// rows out_y + 0..3.
vec4 dot0 = vec4(0.f);
vec4 dot1 = vec4(0.f);
vec4 dot2 = vec4(0.f);
vec4 dot3 = vec4(0.f);
vec4 dot4 = vec4(0.f);
vec4 dot5 = vec4(0.f);
vec4 dot6 = vec4(0.f);
vec4 dot7 = vec4(0.f);
// March along K in vec4 steps; i is the current K/4 position.
int i = 0;
do
{
int dst_x, dst_y, org_x, org_y, src0_x, src0_y, src0_z;
// a0..a3 start at zero so LOAD_A's bounds check yields zero padding.
vec4 a0 = vec4(0.f), a1 = vec4(0.f), a2 = vec4(0.f), a3 = vec4(0.f);
// Cache one vec4 from each of the 8 B rows of this channel tile.
vec4 brow0 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow1 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow2 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow3 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow4 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow5 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow6 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
vec4 brow7 = src1.data[src1_read0_offset]; src1_read0_offset += width0;
// Rewind the 8-row walk and advance one vec4 along K (net +1).
src1_read0_offset += 1 - BLOCK_W * width0;
A_MULTIPLY_BTILE(a0, 0, x);
A_MULTIPLY_BTILE(a1, 1, y);
A_MULTIPLY_BTILE(a2, 2, z);
A_MULTIPLY_BTILE(a3, 3, w);
i++;
}
while( i < width0 );
// Add bias: two vec4s cover the tile's 8 output channels; each
// channel's scalar bias is broadcast across its 4 spatial rows.
vec4 bias_val;
bias_val = bias.data[2 * int(gl_GlobalInvocationID.x)];
dot0 += bias_val.xxxx; dot1 += bias_val.yyyy; dot2 += bias_val.zzzz; dot3 += bias_val.wwww;
bias_val = bias.data[2 * int(gl_GlobalInvocationID.x) + 1];
dot4 += bias_val.xxxx; dot5 += bias_val.yyyy; dot6 += bias_val.zzzz; dot7 += bias_val.wwww;
// Store: output is channel-major with vec4 packing along M, so each
// channel (out_x + j) writes one vec4 of 4 spatial values at row gy.
out0.data[output_batch_offset + (out_x + 0) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot0);
out0.data[output_batch_offset + (out_x + 1) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot1);
out0.data[output_batch_offset + (out_x + 2) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot2);
out0.data[output_batch_offset + (out_x + 3) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot3);
out0.data[output_batch_offset + (out_x + 4) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot4);
out0.data[output_batch_offset + (out_x + 5) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot5);
out0.data[output_batch_offset + (out_x + 6) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot6);
out0.data[output_batch_offset + (out_x + 7) * M / VEC_SIZE + gy] = ACTIVATION_FUNCTION(dot7);
}
}
This diff is collapsed.
This diff is collapsed.
......@@ -19,6 +19,9 @@ layout(push_constant) uniform pushBlock {
int M;
int K;
int N;
int basic_shader_batch_idx;
int basic_shader_partition_idx;
int basic_shader_partition_size;
} p;
layout(binding = 0) readonly buffer Input0{
......@@ -36,7 +39,7 @@ layout(binding = 3) writeonly buffer Output{
layout(local_size_x = LOCAL_SZ_X, local_size_y = 1, local_size_z = 1) in;
/*
Each work item compute batch * multiplier output cell along the output depth dimension and batch
Each work item compute one output cell
*/
void main()
{
......@@ -51,7 +54,7 @@ void main()
int org_x = gx * p.stride_w - p.pad_w;
int weight_off = gz * p.filter_h * p.filter_w;
int input_off = gz * p.in_h * p.in_w + org_y * p.in_w + org_x;
int input_off = (p.basic_shader_batch_idx * p.channels + gz) * p.in_h * p.in_w + org_y * p.in_w + org_x;
for(int y = 0; y < p.filter_h; y++)
{
for(int x = 0; x < p.filter_w; x++)
......@@ -65,7 +68,7 @@ void main()
input_off += p.in_w * p.dilation_h;
}
int offset = gz * p.out_h * p.out_w + gy * p.out_w + gx;
int offset = (p.basic_shader_batch_idx * p.channels + gz) * p.out_h * p.out_w + gy * p.out_w + gx;
if (p.has_bias == 1)
out_buffer[offset] = sum + bias_data[gz];
else
......
......@@ -11,8 +11,9 @@
namespace cv { namespace dnn { namespace vkcom {
extern const unsigned int dw_conv_spv[1655];
extern const unsigned int dw_conv_spv[1760];
extern const unsigned int permute_spv[765];
extern const unsigned int conv48_spv[7458];
extern const unsigned int lrn_spv[1845];
extern const unsigned int concat_spv[541];
extern const unsigned int avg_pool_spv[1538];
......@@ -20,7 +21,7 @@ extern const unsigned int softmax_spv[1496];
extern const unsigned int prior_box_spv[1480];
extern const unsigned int max_pool_spv[1449];
extern const unsigned int relu_spv[502];
extern const unsigned int conv_spv[1859];
extern const unsigned int conv_spv[1894];
}}} // namespace cv::dnn::vkcom
......
......@@ -103,7 +103,7 @@ void OpBase::createShaderModule(const uint32_t* spv, size_t sz, const std::strin
VK_CHECK_RESULT(vkCreateShaderModule(device_, &create_info, NULL, &module_));
}
void OpBase::createPipeline(size_t push_constants_size)
void OpBase::createPipeline(size_t push_constants_size, VkSpecializationInfo* specialization_info)
{
// create pipeline
VkPipelineShaderStageCreateInfo stage_create_info = {};
......@@ -111,6 +111,7 @@ void OpBase::createPipeline(size_t push_constants_size)
stage_create_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
stage_create_info.module = module_;
stage_create_info.pName = "main";
stage_create_info.pSpecializationInfo = specialization_info;
VkPushConstantRange push_constant_ranges[1] = {};
push_constant_ranges[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
push_constant_ranges[0].offset = 0;
......
......@@ -14,7 +14,35 @@ namespace cv { namespace dnn { namespace vkcom {
#ifdef HAVE_VULKAN
#define LOCAL_SZ_X 256
#define DEFAULT_LOCAL_SZ 256
#define MAX_COMPUTE_GFLOPS 10
// TODO: query group count from vulkan device
#define MAX_GROUP_COUNT_X 65535
#define MAX_GROUP_COUNT_Y 65535
#define MAX_GROUP_COUNT_Z 65535
// Host-side data blob passed as VkSpecializationInfo::pData when creating
// the conv48 pipeline. Each field is mapped by offsetof() to the GLSL
// specialization constant with the same position (constant_id 0..19), so
// field ORDER and TYPES must stay in lockstep with both the
// SET_SPEC_CONST_ENTRY table and the shader's constant_id declarations —
// do not reorder or insert fields.
struct ShaderConstant {
// Workgroup dimensions (constant_id 0..2).
int lsz_x;
int lsz_y;
int lsz_z;
// Input spatial size and output row width (constant_id 3..5).
int in_h;
int in_w;
int out_w;
// Convolution geometry (constant_id 6..11).
int stride_h;
int stride_w;
int pad_h;
int pad_w;
int filter_h;
int filter_w;
// Input channel count and batch size (constant_id 12..13).
int channels;
int batch;
// GEMM view of the convolution (constant_id 14..16):
// m = out_h * out_w, k = filter_h * filter_w * channels, n = out_channels.
int m;
int k;
int n;
// tail_m = m % 4, the leftover rows of the 4-row blocking (constant_id 17).
int tail_m;
// Dilation factors (constant_id 18..19).
int dilation_h;
int dilation_w;
};
struct ShaderParam {
int in_h;
......@@ -35,6 +63,9 @@ struct ShaderParam {
int M;
int K;
int N;
int basic_shader_batch_idx;
int basic_shader_partition_idx;
int basic_shader_partition_size;
};
OpConv::OpConv(const int out_channel, const bool has_bias,
......@@ -115,26 +146,100 @@ bool OpConv::forward(Tensor& in, Tensor& filter_weights, Tensor& bias, Tensor& o
in_channel_= in_shape[kShapeIdxChannel];
out_height_ = out_shape[kShapeIdxHeight];
out_width_ = out_shape[kShapeIdxWidth];
dwconv_ = (out_channel_ == in_channel_ && in_channel_ == group_);
if (dwconv_ == false)
assert(group_ == 1); // TODO: support group > 1
int M = out_height_ * out_width_;
int K = filter_height_ * filter_width_ * in_channel_;
int N = out_channel_;
if (pipeline_ == VK_NULL_HANDLE)
{
config_.local_size_x = LOCAL_SZ_X;
config_.local_size_x = DEFAULT_LOCAL_SZ;
config_.local_size_y = 1;
config_.local_size_z = 1;
config_.block_height = 1;
config_.block_width = 1;
config_.block_depth = 1;
config_.shader_type = kConvShaderTypeBasic;
if ((N % 8 == 0) && (K % 4 == 0) && (M % 4) == 0)
{
assert(group_ == 1); // TODO: support group > 1
config_.shader_type = kConvShaderType48;
config_.local_size_x = 1;
config_.local_size_y = DEFAULT_LOCAL_SZ;
config_.local_size_z = 1;
config_.block_height = 4;
config_.block_width = 8;
createShaderModule(conv48_spv, sizeof(conv48_spv));
// specialization constants
VkSpecializationInfo spec_info;
ShaderConstant shader_constant;
#define SPECIALIZATION_CONST_NUM 20
VkSpecializationMapEntry entry[SPECIALIZATION_CONST_NUM];
#define SET_SPEC_CONST_ENTRY(n_, id_, offset_, size_) \
entry[n_].constantID = id_; \
entry[n_].offset = offset_; \
entry[n_].size = size_;
shader_constant.lsz_x = config_.local_size_x;
shader_constant.lsz_y = config_.local_size_y;
shader_constant.lsz_z = config_.local_size_z;
shader_constant.in_h = in_height_;
shader_constant.in_w = in_width_;
shader_constant.out_w = out_width_;
shader_constant.stride_h = stride_height_;
shader_constant.stride_w = stride_width_;
shader_constant.pad_h = padding_top_;
shader_constant.pad_w = padding_left_;
shader_constant.filter_h = filter_height_;
shader_constant.filter_w = filter_width_;
shader_constant.channels = in_channel_;
shader_constant.batch = batch_;
shader_constant.m = M;
shader_constant.k = K;
shader_constant.n = N;
shader_constant.tail_m = M % 4;
shader_constant.dilation_h = dilation_height_;
shader_constant.dilation_w = dilation_width_;
SET_SPEC_CONST_ENTRY(0, 0, offsetof(ShaderConstant,lsz_x), sizeof(int));
SET_SPEC_CONST_ENTRY(1, 1, offsetof(ShaderConstant,lsz_y), sizeof(int));
SET_SPEC_CONST_ENTRY(2, 2, offsetof(ShaderConstant,lsz_z), sizeof(int));
SET_SPEC_CONST_ENTRY(3, 3, offsetof(ShaderConstant,in_h), sizeof(int));
SET_SPEC_CONST_ENTRY(4, 4, offsetof(ShaderConstant,in_w), sizeof(int));
SET_SPEC_CONST_ENTRY(5, 5, offsetof(ShaderConstant,out_w), sizeof(int));
SET_SPEC_CONST_ENTRY(6, 6, offsetof(ShaderConstant,stride_h), sizeof(int));
SET_SPEC_CONST_ENTRY(7, 7, offsetof(ShaderConstant,stride_w), sizeof(int));
SET_SPEC_CONST_ENTRY(8, 8, offsetof(ShaderConstant,pad_h), sizeof(int));
SET_SPEC_CONST_ENTRY(9, 9, offsetof(ShaderConstant,pad_w), sizeof(int));
SET_SPEC_CONST_ENTRY(10, 10, offsetof(ShaderConstant,filter_h), sizeof(int));
SET_SPEC_CONST_ENTRY(11, 11, offsetof(ShaderConstant,filter_w), sizeof(int));
SET_SPEC_CONST_ENTRY(12, 12, offsetof(ShaderConstant,channels), sizeof(int));
SET_SPEC_CONST_ENTRY(13, 13, offsetof(ShaderConstant,batch), sizeof(int));
SET_SPEC_CONST_ENTRY(14, 14, offsetof(ShaderConstant,m), sizeof(int));
SET_SPEC_CONST_ENTRY(15, 15, offsetof(ShaderConstant,k), sizeof(int));
SET_SPEC_CONST_ENTRY(16, 16, offsetof(ShaderConstant,n), sizeof(int));
SET_SPEC_CONST_ENTRY(17, 17, offsetof(ShaderConstant,tail_m), sizeof(int));
SET_SPEC_CONST_ENTRY(18, 18, offsetof(ShaderConstant,dilation_h), sizeof(int));
SET_SPEC_CONST_ENTRY(19, 19, offsetof(ShaderConstant,dilation_w), sizeof(int));
if (dwconv_)
spec_info.mapEntryCount = SPECIALIZATION_CONST_NUM;
spec_info.pMapEntries = entry;
spec_info.dataSize = sizeof(shader_constant);
spec_info.pData = &shader_constant;
createPipeline(sizeof(ShaderParam), &spec_info);
}
else if (out_channel_ == in_channel_ && in_channel_ == group_)
{
config_.shader_type = kConvShaderTypeDepthWise;
createShaderModule(dw_conv_spv, sizeof(dw_conv_spv));
createPipeline(sizeof(ShaderParam));
}
else
{
assert(group_ == 1); // TODO: support group > 1
config_.shader_type = kConvShaderTypeBasic;
createShaderModule(conv_spv, sizeof(conv_spv));
createPipeline(sizeof(ShaderParam));
createPipeline(sizeof(ShaderParam));
}
computeGroupCount();
}
......@@ -143,9 +248,6 @@ bool OpConv::forward(Tensor& in, Tensor& filter_weights, Tensor& bias, Tensor& o
bindTensor(device_, filter_weights, 2, descriptor_set_);
bindTensor(device_, out, 3, descriptor_set_);
int M = out_height_ * out_width_;
int K = filter_height_ * filter_width_ * in_channel_;
int N = out_channel_;
ShaderParam param = {in_height_, in_width_,
out_height_, out_width_,
stride_height_, stride_width_,
......@@ -153,16 +255,40 @@ bool OpConv::forward(Tensor& in, Tensor& filter_weights, Tensor& bias, Tensor& o
filter_height_, filter_width_,
dilation_height_, dilation_width_,
in_channel_, batch_, has_bias_,
M, K, N};
M, K, N, 0, 0, 0};
if (config_.shader_type == kConvShaderTypeBasic || config_.shader_type == kConvShaderTypeDepthWise)
{
int partition_num = 1;
if (config_.shader_type == kConvShaderTypeBasic)
{
param.basic_shader_partition_size = group_y_;
partition_num = (int)ceil(1.0 * out_channel_ / group_y_);
}
for (int b = 0; b < batch_; b++)
{
param.basic_shader_batch_idx = b;
for (int n = 0; n < partition_num; n++)
{
param.basic_shader_partition_idx = n;
recordCommandBuffer((void *)&param, sizeof(ShaderParam));
runCommandBuffer();
}
}
}
else
{
recordCommandBuffer();
runCommandBuffer();
}
recordCommandBuffer((void *)&param, sizeof(ShaderParam));
runCommandBuffer();
return true;
}
bool OpConv::computeGroupCount()
{
if (dwconv_)
if (config_.shader_type == kConvShaderTypeDepthWise)
{
group_x_ = alignSize(out_width_, config_.local_size_x) / config_.local_size_x;
group_y_ = alignSize(out_height_, config_.local_size_y) / config_.local_size_y;
......@@ -175,13 +301,31 @@ bool OpConv::computeGroupCount()
if (config_.shader_type == kConvShaderTypeBasic)
{
group_x_ = alignSize(M, config_.local_size_x) / config_.local_size_x;
group_y_ = alignSize(N, config_.local_size_y) / config_.local_size_y;
group_z_ = alignSize(batch_, config_.local_size_z) / config_.local_size_z;
group_x_ = alignSize(out_height_ * out_width_, config_.local_size_x) / config_.local_size_x;
float GFLOPS = (2.0 * filter_height_ * filter_width_ * in_channel_ + 1) *
(out_channel_ * out_height_ * out_width_) / 1000 / 1000 / 1000;
CV_Assert(config_.local_size_y == 1);
group_y_ = std::min(MAX_GROUP_COUNT_Y, (int)floor(MAX_COMPUTE_GFLOPS / (GFLOPS / out_channel_)));
group_z_ = 1;
}
else if (config_.shader_type == kConvShaderType48)
{
assert(config_.block_width == 8 &&
config_.block_height == 4 &&
config_.block_depth == 1 &&
config_.local_size_z == 1);
group_x_ = N / config_.block_width;
group_y_ = alignSize(alignSize(M, 4) / 4, config_.local_size_y) / config_.local_size_y;
group_z_ = batch_;
}
else
CV_Assert(0);
CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z);
return true;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment