/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>
#include <math.h>

#include "c_types_map.hpp"
21 22 23 24
#include "type_helpers.hpp"

#include "cpu_batch_normalization_utils.hpp"

25
#include "bfloat16_utils.hpp"
openvino-pushbot's avatar
openvino-pushbot committed
26 27
#include "ncsp_batch_normalization.hpp"

28 29
// clang 6 and 7 were observed to generate incorrect code with OMP_SIMD in some
// particular cases. The guard below is open-ended (>= 6), so the OMP_SIMD
// pragma is disabled on the guarded loops for every clang release from 6
// onwards, not only for 6 and 7.
#if (defined __clang_major__) && (__clang_major__ >= 6)
#define SAFE_TO_USE_OMP_SIMD 0
#else
#define SAFE_TO_USE_OMP_SIMD 1
#endif

namespace mkldnn {
namespace impl {
namespace cpu {

39
using namespace memory_tracking::names;
openvino-pushbot's avatar
openvino-pushbot committed
40

41 42
template <data_type_t data_type>
void ncsp_batch_normalization_fwd_t<data_type>::execute_forward() const {
openvino-pushbot's avatar
openvino-pushbot committed
43 44
    auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
    auto dst = reinterpret_cast<data_t *>(this->memory(0));
45 46 47 48 49 50
    auto scratchpad = this->scratchpad();

    const bool calculate_stats = !pd()->stats_is_src();
    const bool save_stats = pd()->is_training();
    const bool is_training = pd()->is_training();
    const bool fuse_bn_relu = pd()->fuse_bn_relu();
openvino-pushbot's avatar
openvino-pushbot committed
51

52
    acc_data_t *mean, *variance;
openvino-pushbot's avatar
openvino-pushbot committed
53
    if (!calculate_stats) {
54
        mean = reinterpret_cast<acc_data_t *>(
openvino-pushbot's avatar
openvino-pushbot committed
55
                const_cast<char *>(this->input_memory(1)));
56
        variance = reinterpret_cast<acc_data_t *>(
openvino-pushbot's avatar
openvino-pushbot committed
57 58 59
                const_cast<char *>(this->input_memory(2)));
    } else {
        if (save_stats) {
60 61
            mean = reinterpret_cast<acc_data_t *>(this->memory(1));
            variance = reinterpret_cast<acc_data_t *>(this->memory(2));
openvino-pushbot's avatar
openvino-pushbot committed
62
        } else {
63 64
            mean = scratchpad.template get<acc_data_t>(key_bnorm_tmp_mean);
            variance = scratchpad.template get<acc_data_t>(key_bnorm_tmp_var);
openvino-pushbot's avatar
openvino-pushbot committed
65 66
        }
    }
67
    auto idx_scale_shift = 1 + 2 * pd()->stats_is_src();
68
    auto scaleshift = reinterpret_cast<const acc_data_t *>(
openvino-pushbot's avatar
openvino-pushbot committed
69
            this->input_memory(idx_scale_shift));
70
    auto ws = reinterpret_cast<uint8_t *>(this->memory(pd()->ws_idx()));
71 72
    auto *ws_reduce = scratchpad.template get<acc_data_t>(key_bnorm_reduction);
    acc_data_t *tmp_data_ = scratchpad.template get<acc_data_t>(key_bnorm_bf16cvt);
openvino-pushbot's avatar
openvino-pushbot committed
73

74 75 76
    const float eps = pd()->desc()->batch_norm_epsilon;
    const bool use_scaleshift = pd()->use_scaleshift();
    const bool with_relu = pd()->with_relu_post_op();
openvino-pushbot's avatar
openvino-pushbot committed
77
    auto maybe_post_op
78
            = [&](acc_data_t res) { return (with_relu && res < 0) ? 0 : res; };
79 80
    const bool has_spatial = utils::one_of(pd()->ndims(), 4, 5);
    int SP = (has_spatial) ? pd()->H() * pd()->W() * pd()->D() : 1;
81 82
    const int simd_w = 16;
    const int SP_cl_align = utils::rnd_up(SP, simd_w);
83 84
    size_t N = pd()->MB();
    size_t C = pd()->C();
openvino-pushbot's avatar
openvino-pushbot committed
85

Alexey Suhov's avatar
Alexey Suhov committed
86
    int nthr = mkldnn_get_max_threads();
openvino-pushbot's avatar
openvino-pushbot committed
87 88 89
    size_t l3_size_ = get_cache_size(3, true) * nthr / 2;
    size_t data_size = N * C * SP * sizeof(data_t);
    bool do_blocking = (data_size >= l3_size_ / 2 && l3_size_ > 0);
Alexey Suhov's avatar
Alexey Suhov committed
90

91
    parallel(0, (size_t)mkldnn_get_max_threads(), [&](const int ithr, const int nthr) {
Alexey Suhov's avatar
Alexey Suhov committed
92
        int C_blks_per_iter = 1, iters = 1;
openvino-pushbot's avatar
openvino-pushbot committed
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
        int C_ithr = 0, C_nthr = 0, N_ithr = 0, N_nthr = 0, N_s = 0, N_e = 0;
        int S_ithr = 0, S_nthr = 0, S_s = 0, S_e = 0;
        int C_blk_gl_s = 0, C_blk_gl_e = 0, C_blk_s = 0, C_blk_e = 0;
        if (do_blocking) {
            size_t working_set_size = N * SP * sizeof(data_t);
            bnorm_utils::cache_balance(
                    working_set_size, C, C_blks_per_iter, iters);
        } else
            C_blks_per_iter = C;
        int last_iter_blks = C - (iters - 1) * C_blks_per_iter;
        bool spatial_thr_allowed
                = bnorm_utils::thread_balance(do_blocking, true, ithr, nthr, N,
                        C_blks_per_iter, SP, C_ithr, C_nthr, C_blk_s, C_blk_e,
                        N_ithr, N_nthr, N_s, N_e, S_ithr, S_nthr, S_s, S_e);
        balance211(C_blks_per_iter, nthr, ithr, C_blk_gl_s, C_blk_gl_e);
        int SP_N_ithr = N_ithr * S_nthr + S_ithr;
        int SP_N_nthr = N_nthr * S_nthr;
110

openvino-pushbot's avatar
openvino-pushbot committed
111 112
        for (int it = 0; it < iters; ++it) {
            if (it == iters - 1 && iters > 1) {
113 114 115 116 117 118
                // On the last iteration the access pattern to ws_reduce
                // might change (due to re-balance on C). So sync the
                // threads if they are not synced by the algorithm.
                if (SP_N_nthr == 1 && mkldnn_thr_syncable())
                    mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
119 120 121 122 123 124 125 126 127 128
                S_s = S_e = C_blk_s = C_blk_e = N_s = N_e = 0;
                spatial_thr_allowed = bnorm_utils::thread_balance(do_blocking,
                        spatial_thr_allowed, ithr, nthr, N, last_iter_blks, SP,
                        C_ithr, C_nthr, C_blk_s, C_blk_e, N_ithr, N_nthr, N_s,
                        N_e, S_ithr, S_nthr, S_s, S_e);
                balance211(last_iter_blks, nthr, ithr, C_blk_gl_s, C_blk_gl_e);
                SP_N_ithr = N_ithr * S_nthr + S_ithr;
                SP_N_nthr = N_nthr * S_nthr;
            }
            size_t C_off = it * C_blks_per_iter;
129 130 131 132 133 134
            // On the last iteration the access pattern to ws_reduce
            // might change (due to re-balance on C). Since sync is not always
            // possible (in case of TBB) use different parts of ws for each
            // iteration if threads are not synced by the algorithm.
            size_t ws_iter_off = (mkldnn_thr_syncable() ? 0 : 1) * C_off;

openvino-pushbot's avatar
openvino-pushbot committed
135
            if (calculate_stats) {
136 137 138
                acc_data_t *mean_blk = mean + C_off;
                acc_data_t *variance_blk = variance + C_off;
                for (dim_t c = C_blk_s; c < C_blk_e; c++) {
openvino-pushbot's avatar
openvino-pushbot committed
139
                    size_t off = (c + C_off) * SP;
140 141 142 143 144 145 146 147 148 149 150 151 152 153
                    acc_data_t sum = 0;
                    for (dim_t n = N_s; n < N_e; ++n) {
                        const acc_data_t *_src;
                        size_t soff = off + n * C * SP;
                        if (data_type == data_type::bf16) {
                            // convert src from b16 to f32
                            acc_data_t *tmp_src = tmp_data_ + ithr * SP_cl_align;
                            bf16_cvt_utils::cvt_bfloat16_to_float(tmp_src,
                                    (mkldnn_bfloat16_t *)src + soff,
                                    nstl::max(0, S_e - S_s));
                            _src = tmp_src;
                        } else {
                            _src = reinterpret_cast<const acc_data_t *>(src + soff);
                        }
openvino-pushbot's avatar
openvino-pushbot committed
154
                        PRAGMA_OMP_SIMD(reduction(+ : sum))
155 156
                        for (dim_t sp = S_s; sp < S_e; ++sp) {
                            sum += _src[sp];
openvino-pushbot's avatar
openvino-pushbot committed
157
                        }
158
                    }
159 160
                    ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
                        = sum;
openvino-pushbot's avatar
openvino-pushbot committed
161
                }
Alexey Suhov's avatar
Alexey Suhov committed
162 163 164

                if (SP_N_nthr > 1) mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
165 166 167
                for (int c = C_blk_gl_s; c < C_blk_gl_e; c++) {
                    mean_blk[c] = 0.;
                    for (int n = 0; n < SP_N_nthr; n++)
168 169
                        mean_blk[c] += ws_reduce[ws_iter_off
                                + n * C_blks_per_iter + c];
openvino-pushbot's avatar
openvino-pushbot committed
170 171
                    mean_blk[c] /= (N * SP);
                }
Alexey Suhov's avatar
Alexey Suhov committed
172 173 174

                if (SP_N_nthr > 1) mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
175 176
                for (int c = C_blk_s; c < C_blk_e; c++) {
                    size_t off = c + C_off;
177 178 179 180 181 182 183 184 185 186 187 188 189 190
                    acc_data_t sum = 0.;
                    for (int n = N_s; n < N_e; ++n) {
                        const acc_data_t *_src;
                        size_t soff = off * SP + n * C * SP;
                        if (data_type == data_type::bf16) {
                            // convert src from b16 to f32
                            acc_data_t *tmp_src = tmp_data_ + ithr * SP_cl_align;
                            bf16_cvt_utils::cvt_bfloat16_to_float(tmp_src,
                                    (mkldnn_bfloat16_t *)src + soff,
                                    nstl::max(0, S_e - S_s));
                            _src = tmp_src;
                        } else {
                            _src = reinterpret_cast<const acc_data_t *>(src + soff);
                        }
openvino-pushbot's avatar
openvino-pushbot committed
191 192
                        PRAGMA_OMP_SIMD(reduction(+ : sum))
                        for (int sp = S_s; sp < S_e; ++sp) {
193
                            acc_data_t m = _src[sp] - mean[off];
openvino-pushbot's avatar
openvino-pushbot committed
194 195
                            sum += m * m;
                        }
196
                    }
197 198
                    ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
                        = sum;
openvino-pushbot's avatar
openvino-pushbot committed
199
                }
Alexey Suhov's avatar
Alexey Suhov committed
200 201 202

                if (SP_N_nthr > 1) mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
203 204 205
                for (int c = C_blk_gl_s; c < C_blk_gl_e; c++) {
                    variance_blk[c] = 0.;
                    for (int n = 0; n < SP_N_nthr; n++)
206 207
                        variance_blk[c] += ws_reduce[ws_iter_off
                                + n * C_blks_per_iter + c];
openvino-pushbot's avatar
openvino-pushbot committed
208 209
                    variance_blk[c] /= (N * SP);
                }
Alexey Suhov's avatar
Alexey Suhov committed
210 211

                if (SP_N_nthr > 1) mkldnn_thr_barrier();
openvino-pushbot's avatar
openvino-pushbot committed
212
            }
Alexey Suhov's avatar
Alexey Suhov committed
213

openvino-pushbot's avatar
openvino-pushbot committed
214 215
            for (int c = C_blk_s; c < C_blk_e; c++) {
                size_t off = c + C_off;
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
                acc_data_t sqrt_variance
                        = static_cast<acc_data_t>(sqrtf(variance[off] + eps));
                acc_data_t sm = (use_scaleshift ? scaleshift[off] : 1.0f) / sqrt_variance;
                acc_data_t sv = use_scaleshift ? scaleshift[C + off] : 0;
                for (int n = N_s; n < N_e; ++n) {
                    acc_data_t *_dst;
                    const acc_data_t *_src;
                    size_t s_off = off * SP + n * C * SP;
                    if (data_type == data_type::bf16) {
                        // store dst to f32 buffer
                        _dst = tmp_data_ + ithr * SP_cl_align;
                        // convert src from b16 to f32
                        acc_data_t *tmp_src = tmp_data_ + (nthr + ithr) * SP_cl_align;
                        bf16_cvt_utils::cvt_bfloat16_to_float(tmp_src,
                                (mkldnn_bfloat16_t *)src + s_off,
                                nstl::max(0, S_e - S_s));
                        _src = tmp_src;
                    } else {
                        _dst = reinterpret_cast<acc_data_t *>(dst + s_off);
                        _src = reinterpret_cast<const acc_data_t *>(src + s_off);
                    }
openvino-pushbot's avatar
openvino-pushbot committed
237 238 239 240
#if SAFE_TO_USE_OMP_SIMD
                    PRAGMA_OMP_SIMD()
#endif
                    for (int sp = S_s; sp < S_e; ++sp) {
241 242
                        size_t d_off = s_off + sp;
                        acc_data_t bn_res = sm * (_src[sp] - mean[off]) + sv;
openvino-pushbot's avatar
openvino-pushbot committed
243 244 245 246 247 248 249 250 251 252
                        if (fuse_bn_relu) {
                            if (bn_res <= 0) {
                                bn_res = 0;
                                if (is_training)
                                    ws[d_off] = 0;
                            } else {
                                if (is_training)
                                    ws[d_off] = 1;
                            }
                        }
253
                        _dst[sp] = maybe_post_op(bn_res);
openvino-pushbot's avatar
openvino-pushbot committed
254
                    }
255 256 257 258 259 260 261
                    if (data_type == data_type::bf16) {
                        // convert dst from f32 to b16
                        bf16_cvt_utils::cvt_float_to_bfloat16(
                                (mkldnn_bfloat16_t *)dst + s_off, _dst,
                                nstl::max(0, S_e - S_s));
                    }
                }
openvino-pushbot's avatar
openvino-pushbot committed
262 263
            }
        }
Alexey Suhov's avatar
Alexey Suhov committed
264
    });
openvino-pushbot's avatar
openvino-pushbot committed
265 266
}

267 268 269 270 271
// Explicit instantiations of the forward implementation for the f32 and bf16
// data types.
template struct ncsp_batch_normalization_fwd_t<data_type::f32>;
template struct ncsp_batch_normalization_fwd_t<data_type::bf16>;

template <data_type_t data_type>
void ncsp_batch_normalization_bwd_t<data_type>::execute_backward() const {
openvino-pushbot's avatar
openvino-pushbot committed
272
    auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
273 274
    auto mean = reinterpret_cast<const acc_data_t *>(this->input_memory(1));
    auto variance = reinterpret_cast<const acc_data_t *>(this->input_memory(2));
openvino-pushbot's avatar
openvino-pushbot committed
275
    auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(3));
276 277
    auto scaleshift
            = reinterpret_cast<const acc_data_t *>(this->input_memory(4));
openvino-pushbot's avatar
openvino-pushbot committed
278
    auto diff_src = reinterpret_cast<data_t *>(this->memory(0));
279 280 281 282

    auto scratchpad = this->scratchpad();

    auto diff_scaleshift = this->memory(1)
283 284
        ? reinterpret_cast<acc_data_t *>(this->memory(1))
        : scratchpad.template get<acc_data_t>(key_bnorm_tmp_diff_ss);
openvino-pushbot's avatar
openvino-pushbot committed
285
    auto ws = reinterpret_cast<const uint8_t *>(
286
            this->input_memory(pd()->ws_idx()));
287 288
    auto *ws_reduce = scratchpad.template get<acc_data_t>(key_bnorm_reduction);
    acc_data_t *tmp_data_ = scratchpad.template get<acc_data_t>(key_bnorm_bf16cvt);
openvino-pushbot's avatar
openvino-pushbot committed
289

290 291
    const bool has_spatial = utils::one_of(pd()->ndims(), 4, 5);
    int SP = (has_spatial) ? pd()->H() * pd()->W() * pd()->D() : 1;
292 293
    const int simd_w = 16;
    const int SP_cl_align = utils::rnd_up(SP, simd_w);
294 295 296 297 298
    size_t C = pd()->C(), N = pd()->MB();
    const bool use_scaleshift = pd()->use_scaleshift();
    const float eps = pd()->desc()->batch_norm_epsilon;
    const bool calculate_diff_stats = !pd()->use_global_stats();
    const bool fuse_bn_relu = pd()->fuse_bn_relu();
openvino-pushbot's avatar
openvino-pushbot committed
299

Alexey Suhov's avatar
Alexey Suhov committed
300
    int nthr = mkldnn_get_max_threads();
openvino-pushbot's avatar
openvino-pushbot committed
301 302 303
    size_t l3_size_ = get_cache_size(3, true) * nthr / 2;
    size_t data_size = N * C * SP * sizeof(data_t);
    bool do_blocking = (data_size >= l3_size_ / 2 && l3_size_ > 0);
Alexey Suhov's avatar
Alexey Suhov committed
304

305
    parallel(0, (size_t)mkldnn_get_max_threads(), [&](const int ithr, const int nthr) {
Alexey Suhov's avatar
Alexey Suhov committed
306
        int C_blks_per_iter = 1, iters = 1;
openvino-pushbot's avatar
openvino-pushbot committed
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
        int C_ithr = 0, C_nthr = 0, N_ithr = 0, N_nthr = 0, N_s = 0, N_e = 0;
        int S_ithr = 0, S_nthr = 0, S_s = 0, S_e = 0;
        int C_blk_gl_s = 0, C_blk_gl_e = 0, C_blk_s = 0, C_blk_e = 0;
        if (do_blocking) {
            size_t working_set_size = 2 * N * SP * sizeof(data_t);
            bnorm_utils::cache_balance(
                    working_set_size, C, C_blks_per_iter, iters);
        } else
            C_blks_per_iter = C;
        int last_iter_blks = C - (iters - 1) * C_blks_per_iter;
        bool spatial_thr_allowed
                = bnorm_utils::thread_balance(do_blocking, true, ithr, nthr, N,
                        C_blks_per_iter, SP, C_ithr, C_nthr, C_blk_s, C_blk_e,
                        N_ithr, N_nthr, N_s, N_e, S_ithr, S_nthr, S_s, S_e);
        balance211(C_blks_per_iter, nthr, ithr, C_blk_gl_s, C_blk_gl_e);
        int SP_N_ithr = N_ithr * S_nthr + S_ithr;
        int SP_N_nthr = N_nthr * S_nthr;

        for (int it = 0; it < iters; ++it) {
            if (it == iters - 1 && iters > 1) {
327 328 329 330 331 332
                // On the last iteration the access pattern to ws_reduce
                // might change (due to re-balance on C). So sync the
                // threads if they are not synced by the algorithm.
                if (SP_N_nthr == 1 && mkldnn_thr_syncable())
                    mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
333 334 335 336 337 338
                C_blk_s = C_blk_e = N_s = N_e = 0;
                spatial_thr_allowed = bnorm_utils::thread_balance(do_blocking,
                        spatial_thr_allowed, ithr, nthr, N, last_iter_blks, SP,
                        C_ithr, C_nthr, C_blk_s, C_blk_e, N_ithr, N_nthr, N_s,
                        N_e, S_ithr, S_nthr, S_s, S_e);
                balance211(last_iter_blks, nthr, ithr, C_blk_gl_s, C_blk_gl_e);
Alexey Suhov's avatar
Alexey Suhov committed
339 340
                SP_N_ithr = N_ithr * S_nthr + S_ithr;
                SP_N_nthr = N_nthr * S_nthr;
openvino-pushbot's avatar
openvino-pushbot committed
341 342
            }
            size_t C_off = it * C_blks_per_iter;
343 344 345 346 347 348
            // On the last iteration the access pattern to ws_reduce
            // might change (due to re-balance on C). Since sync is not always
            // possible (in case of TBB) use different parts of ws for each
            // iteration if threads are not synced by the algorithm.
            size_t ws_iter_off = (mkldnn_thr_syncable() ? 0 : 1) * 2 * C_off;

349 350
            acc_data_t *diff_gamma_blk = diff_scaleshift + C_off;
            acc_data_t *diff_beta_blk = diff_scaleshift + C + C_off;
openvino-pushbot's avatar
openvino-pushbot committed
351 352
            for (int c = C_blk_s; c < C_blk_e; c++) {
                size_t off = c + C_off;
353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
                acc_data_t diff_gamma = 0.0, diff_beta = 0.0;
                acc_data_t v_mean = mean[off];
                for (int n = N_s; n < N_e; ++n) {
                    const acc_data_t *_diff_dst;
                    const acc_data_t *_src;
                    size_t s_off = off * SP + n * C * SP;
                    if (data_type == data_type::bf16) {
                        // convert diff_dst from b16 to f32
                        acc_data_t *tmp_diff_dst = tmp_data_ + ithr * SP_cl_align;
                        bf16_cvt_utils::cvt_bfloat16_to_float(tmp_diff_dst,
                                (mkldnn_bfloat16_t *)diff_dst + s_off,
                                nstl::max(0, S_e - S_s));
                        _diff_dst = tmp_diff_dst;
                        // convert src from b16 to f32
                        acc_data_t *tmp_src = tmp_data_ + (nthr + ithr) * SP_cl_align;
                        bf16_cvt_utils::cvt_bfloat16_to_float(tmp_src,
                                (mkldnn_bfloat16_t *)src + s_off,
                                nstl::max(0, S_e - S_s));
                        _src = tmp_src;
                    } else {
                        _diff_dst = reinterpret_cast<const acc_data_t *>(diff_dst + s_off);
                        _src = reinterpret_cast<const acc_data_t *>(src + s_off);
                    }
openvino-pushbot's avatar
openvino-pushbot committed
376 377
                    PRAGMA_OMP_SIMD(reduction(+ : diff_gamma, diff_beta))
                    for (int sp = S_s; sp < S_e; ++sp) {
378 379 380 381
                        const size_t d_off = s_off + sp;
                        acc_data_t dd;
                        if (fuse_bn_relu && !ws[d_off])
                            dd = 0;
openvino-pushbot's avatar
openvino-pushbot committed
382
                        else
383 384 385
                            dd = _diff_dst[sp];
                        diff_gamma
                                += (_src[sp] - v_mean) * dd;
openvino-pushbot's avatar
openvino-pushbot committed
386 387
                        diff_beta += dd;
                    }
388
                }
389 390 391 392
                ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
                    = diff_gamma;
                ws_reduce[ws_iter_off + SP_N_nthr * C_blks_per_iter
                        + SP_N_ithr * C_blks_per_iter + c] = diff_beta;
openvino-pushbot's avatar
openvino-pushbot committed
393
            }
Alexey Suhov's avatar
Alexey Suhov committed
394 395 396

            if (SP_N_nthr > 1) mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
397
            for (int c = C_blk_gl_s; c < C_blk_gl_e; c++) {
398
                acc_data_t sqrt_variance = static_cast<acc_data_t>(
openvino-pushbot's avatar
openvino-pushbot committed
399 400 401 402
                        1.0f / sqrtf(variance[c + C_off] + eps));
                diff_gamma_blk[c] = 0.;
                diff_beta_blk[c] = 0.;
                for (int n = 0; n < SP_N_nthr; n++) {
403
                    diff_gamma_blk[c] += ws_reduce[ws_iter_off
openvino-pushbot's avatar
openvino-pushbot committed
404
                            + n * C_blks_per_iter + c];
405 406 407
                    diff_beta_blk[c] += ws_reduce[ws_iter_off
                            + SP_N_nthr * C_blks_per_iter + n * C_blks_per_iter
                            + c];
openvino-pushbot's avatar
openvino-pushbot committed
408 409 410
                }
                diff_gamma_blk[c] *= sqrt_variance;
            }
Alexey Suhov's avatar
Alexey Suhov committed
411 412 413

            if (SP_N_nthr > 1) mkldnn_thr_barrier();

openvino-pushbot's avatar
openvino-pushbot committed
414 415
            for (int c = C_blk_s; c < C_blk_e; c++) {
                size_t off = c + C_off;
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
                acc_data_t gamma = use_scaleshift ? scaleshift[off] : 1;
                acc_data_t sqrt_variance = static_cast<acc_data_t>(
                        1.0f / sqrtf(variance[off] + eps));
                acc_data_t v_mean = mean[off];
                for (int n = N_s; n < N_e; ++n) {
                    acc_data_t *_diff_src;
                    const acc_data_t *_diff_dst;
                    const acc_data_t *_src;
                    size_t s_off = off * SP + n * C * SP;
                    if (data_type == data_type::bf16) {
                        // store diff_src to f32 buffer
                        _diff_src = tmp_data_ + ithr * SP_cl_align;
                        // convert diff_dst from b16 to f32
                        acc_data_t *tmp_diff_dst = tmp_data_ + ithr * SP_cl_align;
                        bf16_cvt_utils::cvt_bfloat16_to_float(tmp_diff_dst,
                                (mkldnn_bfloat16_t *)diff_dst + s_off,
                                nstl::max(0, S_e - S_s));
                        _diff_dst = tmp_diff_dst;
                        if (calculate_diff_stats) {
                            // convert src from b16 to f32
                            acc_data_t *tmp_src = tmp_data_ + (2 * nthr + ithr) * SP_cl_align;
                            bf16_cvt_utils::cvt_bfloat16_to_float(tmp_src,
                                    (mkldnn_bfloat16_t *)src + s_off,
                                    nstl::max(0, S_e - S_s));
                            _src = tmp_src;
                        } else
                            _src = nullptr; // to avoid compiler warning w/ gcc483
                    } else {
                        _diff_src = reinterpret_cast<acc_data_t *>(diff_src + s_off);
                        _diff_dst = reinterpret_cast<const acc_data_t *>(diff_dst + s_off);
                        _src = reinterpret_cast<const acc_data_t *>(src + s_off);
                    }
openvino-pushbot's avatar
openvino-pushbot committed
448 449 450 451
#if SAFE_TO_USE_OMP_SIMD
                    PRAGMA_OMP_SIMD()
#endif
                    for (int sp = S_s; sp < S_e; ++sp) {
452 453 454 455
                        const size_t d_off = s_off + sp;
                        acc_data_t v_diff_src;
                        if (fuse_bn_relu && !ws[d_off])
                            v_diff_src = 0;
openvino-pushbot's avatar
openvino-pushbot committed
456
                        else
457
                            v_diff_src = _diff_dst[sp];
openvino-pushbot's avatar
openvino-pushbot committed
458 459
                        if (calculate_diff_stats) {
                            v_diff_src -= diff_beta_blk[c] / (SP * N)
460 461 462
                                    + (_src[sp] - v_mean)
                                            * diff_gamma_blk[c] * sqrt_variance
                                            / (SP * N);
openvino-pushbot's avatar
openvino-pushbot committed
463 464
                        }
                        v_diff_src *= gamma * sqrt_variance;
465 466 467 468 469 470 471
                        _diff_src[sp] = v_diff_src;
                    }
                    if (data_type == data_type::bf16) {
                        // convert diff_src from f32 to b16
                        bf16_cvt_utils::cvt_float_to_bfloat16(
                                (mkldnn_bfloat16_t *)diff_src + s_off,
                                _diff_src, nstl::max(0, S_e - S_s));
openvino-pushbot's avatar
openvino-pushbot committed
472
                    }
473
                }
openvino-pushbot's avatar
openvino-pushbot committed
474 475
            }
        }
Alexey Suhov's avatar
Alexey Suhov committed
476
    });
openvino-pushbot's avatar
openvino-pushbot committed
477
}
478 479 480

// Explicit instantiations of the backward implementation for the f32 and bf16
// data types.
template struct ncsp_batch_normalization_bwd_t<data_type::f32>;
template struct ncsp_batch_normalization_bwd_t<data_type::bf16>;
openvino-pushbot's avatar
openvino-pushbot committed
481 482 483 484 485
}
}
}

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s