imgproc_integral.cl 20.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
Andrey Pavlenko's avatar
Andrey Pavlenko committed
28
//     and/or other materials provided with the distribution.
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

Ilya Lavrenov's avatar
Ilya Lavrenov committed
46 47
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
yao's avatar
yao committed
48
#pragma OPENCL EXTENSION cl_amd_fp64:enable
Ilya Lavrenov's avatar
Ilya Lavrenov committed
49 50
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
yao's avatar
yao committed
51
#endif
52 53 54
#define CONVERT(step) ((step)>>1)
#else
#define CONVERT(step) ((step))
55
#endif
Ilya Lavrenov's avatar
Ilya Lavrenov committed
56

57 58 59 60 61 62 63 64 65 66
#define LSIZE 256
#define LSIZE_1 255
#define LSIZE_2 254
#define HF_LSIZE 128
#define LOG_LSIZE 8
#define LOG_NUM_BANKS 5
#define NUM_BANKS 32
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)


67 68
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global TYPE *sqsum,
                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step,int dst1_step)
69
{
70 71
    int lid = get_local_id(0);
    int gid = get_group_id(0);
72
    int4 src_t[2], sum_t[2];
73
    TYPE4 sqsum_t[2];
74
    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
75
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
76
    __local int* sum_p;
77
    __local TYPE* sqsum_p;
78 79 80 81
    src_step = src_step >> 2;
    gid = gid << 1;
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
82 83
        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0);
        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0);
84

niko's avatar
niko committed
85
        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
86
        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
87
        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
88
        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
89
        barrier(CLK_LOCAL_MEM_FENCE);
90

91 92
        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
93
        lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
94 95

        lm_sum[1][bf_loc] = src_t[1];
96
        lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);
97

98 99 100 101 102
        int offset = 1;
        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
103 104
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);
105 106 107 108 109 110 111 112

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
113
        barrier(CLK_LOCAL_MEM_FENCE);
114 115 116 117 118 119 120 121 122 123
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        for(int d = 1;  d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
124 125 126
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

127 128 129 130
            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
131

132 133 134 135
                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
136
        barrier(CLK_LOCAL_MEM_FENCE);
137 138
        int loc_s0 = gid * dst_step  + i + lid - 1 - pre_invalid * dst_step /4, loc_s1 = loc_s0 + dst_step ;
        int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE),loc_sq1 = loc_sq0 + CONVERT(dst1_step);
yao's avatar
yao committed
139 140
        if(lid > 0 && (i+lid) <= rows)
        {
141 142
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
143 144 145
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
146
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
147 148 149 150
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
151
                sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
152
            }
153
            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
154
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
155 156 157 158
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
159
                sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
160
            }
161 162 163 164 165 166
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}


167 168
kernel void integral_rows_D4(__global int4 *srcsum,__global TYPE4 * srcsqsum,__global int *sum ,
                          __global TYPE *sqsum,int rows,int cols,int src_step,int src1_step,int sum_step,
169 170
                          int sqsum_step,int sum_offset,int sqsum_offset)
{
171 172
    int lid = get_local_id(0);
    int gid = get_group_id(0);
173
    int4 src_t[2], sum_t[2];
174
    TYPE4 sqsrc_t[2],sqsum_t[2];
175
    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
176
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
177
    __local int *sum_p;
178
    __local TYPE *sqsum_p;
179
    src_step = src_step >> 4;
180 181
    src1_step = (src1_step / sizeof(TYPE)) >> 2 ;
    gid <<= 1;
182 183
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
184 185 186 187
        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid ] : (int4)0;
        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid ] : (TYPE4)0;
        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid  + 1] : (int4)0;
        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid  + 1] : (TYPE4)0;
188

189
        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
190
        sqsum_t[0] =  (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
191
        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
192
        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
193
        barrier(CLK_LOCAL_MEM_FENCE);
194

195 196 197
        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = sqsrc_t[0];
198

199 200
        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = sqsrc_t[1];
201

202 203 204 205 206
        int offset = 1;
        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
207 208
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);
209 210 211 212 213 214 215 216

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
217
        barrier(CLK_LOCAL_MEM_FENCE);
218 219 220 221 222 223 224 225 226 227
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        for(int d = 1;  d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
228 229 230
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

231 232 233 234
            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
235

236 237 238 239
                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
niko's avatar
niko committed
240
        barrier(CLK_LOCAL_MEM_FENCE);
241 242
        if(gid == 0 && (i + lid) <= rows)
        {
yao's avatar
yao committed
243 244
            sum[sum_offset + i + lid] = 0;
            sqsum[sqsum_offset + i + lid] = 0;
245 246 247
        }
        if(i + lid == 0)
        {
248 249
            int loc0 = gid  * sum_step;
            int loc1 = gid  * CONVERT(sqsum_step);
yao's avatar
yao committed
250
            for(int k = 1; k <= 8; k++)
251
            {
252
                if(gid * 4 + k > cols) break;
253
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
254
                sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
255 256
            }
        }
257 258 259
        int loc_s0 = sum_offset + gid  * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
        int loc_sq0 = sqsum_offset + gid  * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;

yao's avatar
yao committed
260 261
        if(lid > 0 && (i+lid) <= rows)
        {
262 263
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
264 265 266
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
267
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
268 269
            for(int k = 0; k < 4; k++)
            {
270
                if(gid * 4 + k >= cols) break;
271
                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
272
                sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
273
            }
274
            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
275
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
276 277
            for(int k = 0; k < 4; k++)
            {
278
                if(gid * 4 + 4 + k >= cols) break;
279
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
280
                sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
281
            }
282
          }
283 284 285
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
yao's avatar
yao committed
286

287 288
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global TYPE *sqsum,
                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step, int dst1_step)
yao's avatar
yao committed
289
{
290 291
    int lid = get_local_id(0);
    int gid = get_group_id(0);
yao's avatar
yao committed
292
    float4 src_t[2], sum_t[2];
293
    TYPE4 sqsum_t[2];
yao's avatar
yao committed
294
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
295
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
yao's avatar
yao committed
296
    __local float* sum_p;
297
    __local TYPE* sqsum_p;
yao's avatar
yao committed
298 299 300 301
    src_step = src_step >> 2;
    gid = gid << 1;
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
302 303
        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0);
        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0);
yao's avatar
yao committed
304 305

        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
306
        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
yao's avatar
yao committed
307
        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
308
        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
yao's avatar
yao committed
309 310 311 312
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
313
        lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
yao's avatar
yao committed
314 315

        lm_sum[1][bf_loc] = src_t[1];
316
        lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);
yao's avatar
yao committed
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357

        int offset = 1;
        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        for(int d = 1;  d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
358
        int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE), loc_sq1 = loc_sq0 + CONVERT(dst1_step);
yao's avatar
yao committed
359 360 361 362 363 364 365
        if(lid > 0 && (i+lid) <= rows)
        {
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
366
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
yao's avatar
yao committed
367 368 369 370
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
371
                sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
yao's avatar
yao committed
372 373
            }
            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
374
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
yao's avatar
yao committed
375 376 377 378
            for(int k = 0; k < 4; k++)
            {
                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
379
                sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
yao's avatar
yao committed
380 381 382 383 384 385 386
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}


387 388
kernel void integral_rows_D5(__global float4 *srcsum,__global TYPE4 * srcsqsum,__global float *sum ,
                          __global TYPE *sqsum,int rows,int cols,int src_step,int src1_step, int sum_step,
yao's avatar
yao committed
389 390
                          int sqsum_step,int sum_offset,int sqsum_offset)
{
391 392
    int lid = get_local_id(0);
    int gid = get_group_id(0);
yao's avatar
yao committed
393
    float4 src_t[2], sum_t[2];
394
    TYPE4 sqsrc_t[2],sqsum_t[2];
yao's avatar
yao committed
395
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
396
    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
yao's avatar
yao committed
397
    __local float *sum_p;
398
    __local TYPE *sqsum_p;
yao's avatar
yao committed
399
    src_step = src_step >> 4;
400
    src1_step = (src1_step / sizeof(TYPE)) >> 2;
yao's avatar
yao committed
401 402 403
    for(int i = 0; i < rows; i =i + LSIZE_1)
    {
        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
404
        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2] : (TYPE4)0;
yao's avatar
yao committed
405
        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
406
        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2 + 1] : (TYPE4)0;
yao's avatar
yao committed
407 408

        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
409
        sqsum_t[0] =  (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
yao's avatar
yao committed
410
        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
411
        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
yao's avatar
yao committed
412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sqsum[0][bf_loc] = sqsrc_t[0];

        lm_sum[1][bf_loc] = src_t[1];
        lm_sqsum[1][bf_loc] = sqsrc_t[1];

        int offset = 1;
        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }
        for(int d = 1;  d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];

                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(gid == 0 && (i + lid) <= rows)
        {
            sum[sum_offset + i + lid] = 0;
            sqsum[sqsum_offset + i + lid] = 0;
        }
        if(i + lid == 0)
        {
            int loc0 = gid * 2 * sum_step;
468
            int loc1 = gid * 2 * CONVERT(sqsum_step);
yao's avatar
yao committed
469 470 471 472
            for(int k = 1; k <= 8; k++)
            {
                if(gid * 8 + k > cols) break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
473
                sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
yao's avatar
yao committed
474 475 476
            }
        }
        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
477
        int loc_sq0 = sqsum_offset + gid * 2 * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;
yao's avatar
yao committed
478 479 480 481 482 483 484
        if(lid > 0 && (i+lid) <= rows)
        {
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];
            lm_sqsum[0][bf_loc] += sqsum_t[0];
            lm_sqsum[1][bf_loc] += sqsum_t[1];
            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
485
            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
yao's avatar
yao committed
486 487 488 489
            for(int k = 0; k < 4; k++)
            {
                if(gid * 8 + k >= cols) break;
                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
490
                sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
yao's avatar
yao committed
491 492
            }
            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
493
            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
yao's avatar
yao committed
494 495 496 497
            for(int k = 0; k < 4; k++)
            {
                if(gid * 8 + 4 + k >= cols) break;
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
498
                sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
yao's avatar
yao committed
499 500 501 502
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
503
}