Commit d37fa712 authored by Jaikrishnan Menon's avatar Jaikrishnan Menon Committed by Scott Cyphers

DEX Part 3 (#1184)

* CPU Direct Execution: Implement ConvertLayout and refactor

* CPU Direct Execution: Implement Convolution
parent 4cd2c602
......@@ -27,6 +27,8 @@ set(SRC
cpu_tensor_view_wrapper.cpp
cpu_tensor_view.cpp
cpu_tracing.cpp
builder/convert_layout.cpp
builder/convolution.cpp
kernel/eigen_thread_pool.cpp
kernel/pad.cpp
kernel/reduce_max.cpp
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Builds the executor functor for the CPU ConvertLayout op: converts a tensor
// from its input MKLDNN memory format to the requested output format by
// emitting a single mkldnn reorder primitive at build time and invoking it at
// run time.
template <>
void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::ConvertLayout)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
// References into the tensor-data map; the functor below captures these by
// reference, so the map entries must stay stable for the functor's lifetime.
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto input_tvl =
node->get_inputs()[0].get_output().get_tensor_view()->get_tensor_view_layout();
// NOTE(review): dynamic_pointer_cast returns null when the layout is not a
// cpu::LayoutDescriptor, and the next use would dereference null. Presumably
// this builder only ever sees CPU-laid-out tensors — confirm.
auto input_cpu_tvl =
dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(input_tvl);
auto input_format = input_cpu_tvl->get_mkldnn_format();
// Reorder input shape if needed
// (permute args[0]'s shape into the layout's axis order so the memory
// descriptor built below matches the physical element order).
auto input_axis_order = input_cpu_tvl->get_axis_order();
Shape input_shape(input_axis_order.size());
for (size_t idx = 0; idx < input_axis_order.size(); idx++)
{
input_shape[idx] = args[0].get_shape()[input_axis_order[idx]];
}
auto output_tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
auto output_format =
dynamic_cast<runtime::cpu::LayoutDescriptor&>(*output_tvl).get_mkldnn_format();
// MKLDNN relies on format names for selecting optimized kernel implementations
// Hacky way to deal with this until they move to using canonicalized layouts
// (when the opposite side is a filter format, relabel nchw as oihw — they
// appear to describe the same 4-d element order here; confirm against MKLDNN).
if (input_format == mkldnn::memory::format::nchw &&
runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(output_format))
{
input_format = mkldnn::memory::format::oihw;
}
if (output_format == mkldnn::memory::format::nchw &&
runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(input_format))
{
output_format = mkldnn::memory::format::oihw;
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_emitter->build_memory_descriptor(
input_shape, args[0].get_element_type(), input_format);
auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);
// The reorder primitive is built once; the functor only rebinds the two
// memory pointers (deps[0] = src, deps[1] = dst) and fires the primitive.
size_t reorder_index = mkldnn_emitter->build_reorder(input_desc, result_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(reorder_index);
auto functor = [&, reorder_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, reorder_index);
};
functors.emplace_back(functor);
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/convolution.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/convolution.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Builds the executor functor for ngraph::op::Convolution. If MKLDNN can
// handle this node, a convolution_forward primitive is emitted and the
// functor merely rebinds memory pointers; otherwise the generic reference
// convolution kernel (selected by element type) is invoked directly.
template <>
void Builder::BUILDER_DECL(ngraph::op::Convolution)
{
    auto convolution = static_cast<const ngraph::op::Convolution*>(node);

    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    auto arg0_shape = args[0].get_shape();
    auto arg1_shape = args[1].get_shape();
    auto result_shape = out[0].get_shape();

    // References into the tensor-data map; captured by reference in the
    // functors below, so the map entries must outlive the functors.
    auto& arg0_tensor = tensor_data[args[0].get_name()];
    auto& arg1_tensor = tensor_data[args[1].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // For dilation, MKLDNN wants to know how many elements to insert
        // between, not how far apart to space the elements like nGraph.
        // So we have to subtract 1 from each stride.
        Strides window_dilation_strides_adjusted;
        for (size_t s : convolution->get_window_dilation_strides())
        {
            window_dilation_strides_adjusted.push_back(s - 1);
        }

        auto input_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
        auto weights_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
        // HACK to help MKLDNN pick the right implementation: relabel nchw
        // weights as oihw (filter semantics) before building the descriptor.
        if (weights_format == mkldnn::memory::format::nchw)
        {
            weights_format = mkldnn::memory::format::oihw;
        }
        auto output_format = runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_data_desc = mkldnn_emitter->build_memory_descriptor(args[0], input_format);
        auto weights_desc = mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
        auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);

        // Initialize directly (the original default-initialized to 0 and then
        // immediately reassigned).
        size_t conv_index = mkldnn_emitter->build_convolution_forward(
            input_data_desc,
            weights_desc,
            result_desc,
            convolution->get_window_movement_strides(),
            window_dilation_strides_adjusted,
            convolution->get_padding_below(),
            convolution->get_padding_above());

        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        auto functor = [&, conv_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
        };
        functors.emplace_back(functor);
    }
    else
    {
        // Fallback: generic reference convolution, instantiated for the
        // output element type.
        std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
        SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);

        auto window_movement_strides = convolution->get_window_movement_strides();
        auto window_dilation_strides = convolution->get_window_dilation_strides();
        auto padding_below = convolution->get_padding_below();
        auto padding_above = convolution->get_padding_above();
        auto data_dilation_strides = convolution->get_data_dilation_strides();

        auto functor = [&,
                        kernel,
                        arg0_shape,
                        arg1_shape,
                        result_shape,
                        window_movement_strides,
                        window_dilation_strides,
                        padding_below,
                        padding_above,
                        data_dilation_strides](CPURuntimeContext* ctx) {
            // Trailing constants are the axis roles for the forward pass:
            // batch_axis_data=0, input_channel_axis_data=1,
            // input_channel_axis_filters=1, output_channel_axis_filters=0,
            // batch_axis_result=0, output_channel_axis_result=1,
            // rotate_filter=false.
            kernel(arg0_tensor,
                   arg1_tensor,
                   out_tensor,
                   arg0_shape,
                   arg1_shape,
                   result_shape,
                   window_movement_strides,
                   window_dilation_strides,
                   padding_below,
                   padding_above,
                   data_dilation_strides,
                   0,
                   1,
                   1,
                   0,
                   0,
                   1,
                   false);
        };
        functors.emplace_back(functor);
    }
}
// Builds the executor functor for ConvolutionBackpropData (gradient with
// respect to the input data). args[0] holds the filters and args[1] the
// output delta (see the MKLDNN descriptor construction below). The MKLDNN
// path emits a convolution_backward_data primitive; the fallback path reuses
// the generic reference convolution with the two inputs SWAPPED and
// rotate_filter = true.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropData)
{
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropData*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto result_shape = out[0].get_shape();
// References into the tensor-data map; captured by reference in the
// functors below, so the map entries must outlive the functors.
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// MKLDNN counts elements inserted between, nGraph stores spacing:
// subtract 1 from each forward dilation stride.
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides_forward())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
// HACK to help MKLDNN pick the right implementation
// (relabel nchw-formatted weights as the filter format oihw).
auto weights_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
if (weights_format == mkldnn::memory::format::nchw)
{
weights_format = mkldnn::memory::format::oihw;
}
auto weights_desc =
mkldnn_emitter->build_memory_descriptor(args[0], weights_format);
auto delta_desc = mkldnn_emitter->build_memory_descriptor(
args[1], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1));
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
size_t conv_bwd_data_index = mkldnn_emitter->build_convolution_backward_data(
weights_desc,
delta_desc,
result_desc,
convolution->get_window_movement_strides_forward(),
window_dilation_strides_adjusted,
convolution->get_padding_below_forward(),
convolution->get_padding_above_forward());
auto& deps = mkldnn_emitter->get_primitive_deps(conv_bwd_data_index);
auto functor = [&, conv_bwd_data_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_bwd_data_index);
};
functors.emplace_back(functor);
}
else
{
std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);
auto window_movement_strides =
convolution->get_window_movement_strides_backward();
auto window_dilation_strides =
convolution->get_window_dilation_strides_backward();
auto padding_below = convolution->get_padding_below_backward();
auto padding_above = convolution->get_padding_above_backward();
auto data_dilation_strides = convolution->get_data_dilation_strides_backward();
auto functor = [&,
kernel,
arg0_shape,
arg1_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides](CPURuntimeContext* ctx) {
// NOTE: arg1 (delta) is passed as the "data" input and arg0
// (filters) second — the backward-data pass is a forward
// convolution of the delta with rotated filters
// (rotate_filter = true) and swapped channel-axis roles.
kernel(arg1_tensor,
arg0_tensor,
out_tensor,
arg1_shape,
arg0_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
0,
1,
0,
1,
0,
1,
true);
};
functors.emplace_back(functor);
}
}
// Builds the executor functor for ConvolutionBackpropFilters (gradient with
// respect to the filters). An MKLDNN convolution_backward_weights primitive
// is emitted when MKLDNN can handle the node; otherwise the generic reference
// convolution kernel is invoked with axis roles chosen for the filter
// gradient.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropFilters)
{
    auto bprop_op = static_cast<const ngraph::op::ConvolutionBackpropFilters*>(node);

    auto& functor_list = external_function->get_functors();
    auto& tdata = external_function->get_tensor_data();

    // args[0]: forward-pass input, args[1]: output delta, out[0]: the
    // resulting filter gradient.
    auto data_shape = args[0].get_shape();
    auto delta_shape = args[1].get_shape();
    auto filters_shape = out[0].get_shape();

    // Tensor-data slots; captured by reference in the functors below, so the
    // map entries must outlive the functors.
    auto& data_tensor = tdata[args[0].get_name()];
    auto& delta_tensor = tdata[args[1].get_name()];
    auto& filters_tensor = tdata[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // MKLDNN expects the number of elements inserted between samples,
        // while nGraph stores the spacing: subtract one from every forward
        // dilation stride.
        Strides dilation_minus_one = bprop_op->get_window_dilation_strides_forward();
        for (auto& d : dilation_minus_one)
        {
            d -= 1;
        }

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_desc = mkldnn_emitter->build_memory_descriptor(
            args[0], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
        auto delta_desc = mkldnn_emitter->build_memory_descriptor(
            args[1], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1));
        auto result_desc = mkldnn_emitter->build_memory_descriptor(
            out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));

        size_t bwd_weights_index = mkldnn_emitter->build_convolution_backward_weights(
            input_desc,
            delta_desc,
            result_desc,
            bprop_op->get_window_movement_strides_forward(),
            dilation_minus_one,
            bprop_op->get_padding_below_forward(),
            bprop_op->get_padding_above_forward());

        auto& prim_deps = mkldnn_emitter->get_primitive_deps(bwd_weights_index);
        auto mkldnn_functor = [&, bwd_weights_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[0], data_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[1], delta_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[2], filters_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, bwd_weights_index);
        };
        functor_list.emplace_back(mkldnn_functor);
    }
    else
    {
        // Fallback: reference convolution instantiated for the output
        // element type.
        std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
        SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);

        auto movement = bprop_op->get_window_movement_strides_backward();
        auto dilation = bprop_op->get_window_dilation_strides_backward();
        auto below = bprop_op->get_padding_below_backward();
        auto above = bprop_op->get_padding_above_backward();
        auto data_dilation = bprop_op->get_data_dilation_strides_backward();

        auto ref_functor = [&,
                            kernel,
                            data_shape,
                            delta_shape,
                            filters_shape,
                            movement,
                            dilation,
                            below,
                            above,
                            data_dilation](CPURuntimeContext* ctx) {
            // Trailing constants are the axis roles (batch_axis_data=1,
            // input_channel_axis_data=0, input_channel_axis_filters=0,
            // output_channel_axis_filters=1, batch_axis_result=1,
            // output_channel_axis_result=0, rotate_filter=false).
            kernel(data_tensor,
                   delta_tensor,
                   filters_tensor,
                   data_shape,
                   delta_shape,
                   filters_shape,
                   movement,
                   dilation,
                   below,
                   above,
                   data_dilation,
                   1,
                   0,
                   0,
                   1,
                   1,
                   0,
                   false);
        };
        functor_list.emplace_back(ref_functor);
    }
}
}
}
}
......@@ -98,7 +98,6 @@
#include "ngraph/runtime/cpu/kernel/multiply.hpp"
#include "ngraph/runtime/cpu/kernel/relu.hpp"
#include "ngraph/runtime/cpu/kernel/result.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
......@@ -119,53 +118,6 @@
using namespace std;
using namespace ngraph;
// Per-type kernel macro
// Expands to an if/else-if chain that instantiates the kernel template K for
// the runtime element type ET and assigns the instantiation to KV.
// element::boolean maps to char (one byte per element in this codebase).
// If ET matches none of the listed types, KV is left unassigned — callers
// must pass one of the supported element types.
#define SELECT_KERNEL(KV, ET, K) \
if (ET == element::boolean) \
{ \
KV = K<char>; \
} \
else if (ET == element::f32) \
{ \
KV = K<float>; \
} \
else if (ET == element::f64) \
{ \
KV = K<double>; \
} \
else if (ET == element::i8) \
{ \
KV = K<int8_t>; \
} \
else if (ET == element::i16) \
{ \
KV = K<int16_t>; \
} \
else if (ET == element::i32) \
{ \
KV = K<int32_t>; \
} \
else if (ET == element::i64) \
{ \
KV = K<int64_t>; \
} \
else if (ET == element::u8) \
{ \
KV = K<uint8_t>; \
} \
else if (ET == element::u16) \
{ \
KV = K<uint16_t>; \
} \
else if (ET == element::u32) \
{ \
KV = K<uint32_t>; \
} \
else if (ET == element::u64) \
{ \
KV = K<uint64_t>; \
}
#define BUILD_UNARY_ELEMWISE_FUNCTOR(OP) \
auto& functors = external_function->get_functors(); \
auto& tensor_data = external_function->get_tensor_data(); \
......@@ -419,6 +371,14 @@ namespace ngraph
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
{TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Convolution),
&runtime::cpu::Builder::build<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::Relu), &runtime::cpu::Builder::build<ngraph::op::Relu>},
{TI(ngraph::op::Result), &runtime::cpu::Builder::build<ngraph::op::Result>},
{TI(ngraph::op::MatmulBias), &runtime::cpu::Builder::build<ngraph::op::MatmulBias>},
......
......@@ -29,6 +29,134 @@
const std::vector<TensorViewWrapper>& args, \
const std::vector<TensorViewWrapper>& out)
// Per-type kernel macro
// Expands to an if/else-if chain that instantiates the kernel template K for
// the runtime element type ET and assigns the instantiation to KV.
// element::boolean maps to char (one byte per element in this codebase).
// If ET matches none of the listed types, KV is left unassigned — callers
// must pass one of the supported element types.
#define SELECT_KERNEL(KV, ET, K) \
if (ET == element::boolean) \
{ \
KV = K<char>; \
} \
else if (ET == element::f32) \
{ \
KV = K<float>; \
} \
else if (ET == element::f64) \
{ \
KV = K<double>; \
} \
else if (ET == element::i8) \
{ \
KV = K<int8_t>; \
} \
else if (ET == element::i16) \
{ \
KV = K<int16_t>; \
} \
else if (ET == element::i32) \
{ \
KV = K<int32_t>; \
} \
else if (ET == element::i64) \
{ \
KV = K<int64_t>; \
} \
else if (ET == element::u8) \
{ \
KV = K<uint8_t>; \
} \
else if (ET == element::u16) \
{ \
KV = K<uint16_t>; \
} \
else if (ET == element::u32) \
{ \
KV = K<uint32_t>; \
} \
else if (ET == element::u64) \
{ \
KV = K<uint64_t>; \
}
// Expands to an if/else-if chain over the tensor rank R (1..16), assigning the
// kernel template K instantiated with <ET, rank> to KV. For R outside 1..16
// KV is left unassigned — callers must guarantee a supported rank.
#define SELECT_RANK(KV, ET, R, K) \
if (R == 1) \
KV = K<ET, 1>; \
else if (R == 2) \
KV = K<ET, 2>; \
else if (R == 3) \
KV = K<ET, 3>; \
else if (R == 4) \
KV = K<ET, 4>; \
else if (R == 5) \
KV = K<ET, 5>; \
else if (R == 6) \
KV = K<ET, 6>; \
else if (R == 7) \
KV = K<ET, 7>; \
else if (R == 8) \
KV = K<ET, 8>; \
else if (R == 9) \
KV = K<ET, 9>; \
else if (R == 10) \
KV = K<ET, 10>; \
else if (R == 11) \
KV = K<ET, 11>; \
else if (R == 12) \
KV = K<ET, 12>; \
else if (R == 13) \
KV = K<ET, 13>; \
else if (R == 14) \
KV = K<ET, 14>; \
else if (R == 15) \
KV = K<ET, 15>; \
else if (R == 16) \
KV = K<ET, 16>;
// Per-type and rank kernel macro
// Two-level dispatch: first branches on the runtime element type ET, then
// delegates to SELECT_RANK to branch on rank R, assigning K<type, rank> to KV.
// As with SELECT_KERNEL, an unmatched ET (or rank) leaves KV unassigned.
#define SELECT_KERNEL_BY_RANK(KV, ET, R, K) \
if (ET == element::boolean) \
{ \
SELECT_RANK(KV, char, R, K); \
} \
else if (ET == element::f32) \
{ \
SELECT_RANK(KV, float, R, K); \
} \
else if (ET == element::f64) \
{ \
SELECT_RANK(KV, double, R, K); \
} \
else if (ET == element::i8) \
{ \
SELECT_RANK(KV, int8_t, R, K); \
} \
else if (ET == element::i16) \
{ \
SELECT_RANK(KV, int16_t, R, K); \
} \
else if (ET == element::i32) \
{ \
SELECT_RANK(KV, int32_t, R, K); \
} \
else if (ET == element::i64) \
{ \
SELECT_RANK(KV, int64_t, R, K); \
} \
else if (ET == element::u8) \
{ \
SELECT_RANK(KV, uint8_t, R, K); \
} \
else if (ET == element::u16) \
{ \
SELECT_RANK(KV, uint16_t, R, K); \
} \
else if (ET == element::u32) \
{ \
SELECT_RANK(KV, uint32_t, R, K); \
} \
else if (ET == element::u64) \
{ \
SELECT_RANK(KV, uint64_t, R, K); \
}
namespace ngraph
{
namespace runtime
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/convolution.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erased adapter over reference::convolution: casts the raw buffers
// to ElementType pointers and forwards every geometry/axis parameter
// unchanged. The buffers are only reinterpreted here — never resized, copied
// or validated.
template <typename ElementType>
void convolution(void* input0,
                 void* input1,
                 void* output,
                 const Shape& arg0_shape,
                 const Shape& arg1_shape,
                 const Shape& result_shape,
                 const Strides& window_movement_strides,
                 const Strides& window_dilation_strides,
                 const CoordinateDiff& padding_below,
                 const CoordinateDiff& padding_above,
                 const Strides& data_dilation_strides,
                 size_t batch_axis_data,
                 size_t input_channel_axis_data,
                 size_t input_channel_axis_filters,
                 size_t output_channel_axis_filters,
                 size_t batch_axis_result,
                 size_t output_channel_axis_result,
                 bool rotate_filter)
{
    const ElementType* in0 = static_cast<const ElementType*>(input0);
    const ElementType* in1 = static_cast<const ElementType*>(input1);
    ElementType* out_buf = static_cast<ElementType*>(output);

    reference::convolution<ElementType>(in0,
                                        in1,
                                        out_buf,
                                        arg0_shape,
                                        arg1_shape,
                                        result_shape,
                                        window_movement_strides,
                                        window_dilation_strides,
                                        padding_below,
                                        padding_above,
                                        data_dilation_strides,
                                        batch_axis_data,
                                        input_channel_axis_data,
                                        input_channel_axis_filters,
                                        output_channel_axis_filters,
                                        batch_axis_result,
                                        output_channel_axis_result,
                                        rotate_filter);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment