Commit bd1d7cd2 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky Committed by OpenCV Buildbot

Merge pull request #839 from pengx17:2.4_ocl_csbp

parents 87765c0f 9cfa24e5
...@@ -1769,6 +1769,44 @@ namespace cv ...@@ -1769,6 +1769,44 @@ namespace cv
std::vector<oclMat> datas; std::vector<oclMat> datas;
oclMat out; oclMat out;
}; };
class CV_EXPORTS StereoConstantSpaceBP
{
public:
enum { DEFAULT_NDISP = 128 };
enum { DEFAULT_ITERS = 8 };
enum { DEFAULT_LEVELS = 4 };
enum { DEFAULT_NR_PLANE = 4 };
static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
explicit StereoConstantSpaceBP(
int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int nr_plane = DEFAULT_NR_PLANE,
int msg_type = CV_32F);
StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
int min_disp_th = 0,
int msg_type = CV_32F);
void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
int ndisp;
int iters;
int levels;
int nr_plane;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int min_disp_th;
int msg_type;
bool use_local_init_data_cost;
private:
oclMat u[2], d[2], l[2], r[2];
oclMat disp_selected_pyr[2];
oclMat data_cost;
oclMat data_cost_selected;
oclMat temp;
oclMat out;
};
} }
} }
#if defined _MSC_VER && _MSC_VER >= 1200 #if defined _MSC_VER && _MSC_VER >= 1200
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Jin Ma, jin@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef FLT_MAX
#define FLT_MAX CL_FLT_MAX
#endif
#ifndef SHRT_MAX
#define SHRT_MAX CL_SHORT_MAX
#endif
///////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////get_first_k_initial_global//////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
__kernel void get_first_k_initial_global_0(__global short *data_cost_selected_, __global short *selected_disp_pyr,
__global short *ctemp, int h, int w, int nr_plane,
int cmsg_step1, int cdisp_step1, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
__global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global short *data_cost = ctemp + y * cmsg_step1 + x;
for(int i = 0; i < nr_plane; i++)
{
short minimum = SHRT_MAX;
int id = 0;
for(int d = 0; d < cndisp; d++)
{
short cur = data_cost[d * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = SHRT_MAX;
}
}
}
__kernel void get_first_k_initial_global_1(__global float *data_cost_selected_, __global float *selected_disp_pyr,
__global float *ctemp, int h, int w, int nr_plane,
int cmsg_step1, int cdisp_step1, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
__global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global float *data_cost = ctemp + y * cmsg_step1 + x;
for(int i = 0; i < nr_plane; i++)
{
float minimum = FLT_MAX;
int id = 0;
for(int d = 0; d < cndisp; d++)
{
float cur = data_cost[d * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = FLT_MAX;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////get_first_k_initial_local////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void get_first_k_initial_local_0(__global short *data_cost_selected_, __global short *selected_disp_pyr,
__global short *ctemp,int h, int w, int nr_plane,
int cmsg_step1, int cdisp_step1, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
__global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global short *data_cost = ctemp + y * cmsg_step1 + x;
int nr_local_minimum = 0;
short prev = data_cost[0 * cdisp_step1];
short cur = data_cost[1 * cdisp_step1];
short next = data_cost[2 * cdisp_step1];
for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
{
if (cur < prev && cur < next)
{
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost[d * cdisp_step1] = SHRT_MAX;
nr_local_minimum++;
}
prev = cur;
cur = next;
next = data_cost[(d + 1) * cdisp_step1];
}
for (int i = nr_local_minimum; i < nr_plane; i++)
{
short minimum = SHRT_MAX;
int id = 0;
for (int d = 0; d < cndisp; d++)
{
cur = data_cost[d * cdisp_step1];
if (cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost[id * cdisp_step1] = SHRT_MAX;
}
}
}
__kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, __global float *selected_disp_pyr,
__global float *ctemp,int h, int w, int nr_plane,
int cmsg_step1, int cdisp_step1, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
__global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global float *data_cost = ctemp + y * cmsg_step1 + x;
int nr_local_minimum = 0;
float prev = data_cost[0 * cdisp_step1];
float cur = data_cost[1 * cdisp_step1];
float next = data_cost[2 * cdisp_step1];
for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
{
if (cur < prev && cur < next)
{
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost[d * cdisp_step1] = FLT_MAX ;
nr_local_minimum++;
}
prev = cur;
cur = next;
next = data_cost[(d + 1) * cdisp_step1];
}
for (int i = nr_local_minimum; i < nr_plane; i++)
{
float minimum = FLT_MAX;
int id = 0;
for (int d = 0; d < cndisp; d++)
{
cur = data_cost[d * cdisp_step1];
if (cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost[id * cdisp_step1] = FLT_MAX;
}
}
}
///////////////////////////////////////////////////////////////
/////////////////////// init data cost ////////////////////////
///////////////////////////////////////////////////////////////
float compute_3(__global uchar* left, __global uchar* right,
float cdata_weight, float cmax_data_term)
{
float tb = 0.114f * abs((int)left[0] - right[0]);
float tg = 0.587f * abs((int)left[1] - right[1]);
float tr = 0.299f * abs((int)left[2] - right[2]);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
}
float compute_1(__global uchar* left, __global uchar* right,
float cdata_weight, float cmax_data_term)
{
return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term);
}
short round_short(float v){
return convert_short_sat_rte(v);
}
///////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////init_data_cost///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////
__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
int h, int w, int level, int channels,
int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
int cth, int cimg_step, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
__global short *data_cost = ctemp + y * cmsg_step1 + x;
for(int d = 0; d < cndisp; ++d)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int xr = xi - d;
if(d < cth || xr < 0)
val += cdata_weight * cmax_data_term;
else
{
__global uchar *lle = cleft + yi * cimg_step + xi * channels;
__global uchar *lri = cright + yi * cimg_step + xr * channels;
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
}
}
}
data_cost[cdisp_step1 * d] = round_short(val);
}
}
}
__kernel void init_data_cost_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright,
int h, int w, int level, int channels,
int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
int cth, int cimg_step, int cndisp)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
__global float *data_cost = ctemp + y * cmsg_step1 + x;
for(int d = 0; d < cndisp; ++d)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int xr = xi - d;
if(d < cth || xr < 0)
val += cdata_weight * cmax_data_term;
else
{
__global uchar* lle = cleft + yi * cimg_step + xi * channels;
__global uchar* lri = cright + yi * cimg_step + xr * channels;
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
}
}
}
data_cost[cdisp_step1 * d] = val;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////init_data_cost_reduce//////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void init_data_cost_reduce_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
__local float *smem, int level, int rows, int cols, int h, int winsz, int channels,
int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth,
int cdisp_step1, int cmsg_step1)
{
int x_out = get_group_id(0);
int y_out = get_group_id(1) % h;
//int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2);
int tid = get_local_id(0);
if (d < cndisp)
{
int x0 = x_out << level;
int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0;
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - d < 0 || d < cth)
val = cdata_weight * cmax_data_term * len;
else
{
__global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
__global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
for(int y = 0; y < len; ++y)
{
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
lle += cimg_step;
lri += cimg_step;
}
}
}
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 256)
{
if (tid < 128)
dline[tid] += dline[tid + 128];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 128)
{
if (tid < 64)
dline[tid] += dline[tid + 64];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d<cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
__global short* data_cost = ctemp + y_out * cmsg_step1 + x_out;
if (tid == 0)
data_cost[cdisp_step1 * d] = convert_short_sat_rte(dline[0]);
}
}
__kernel void init_data_cost_reduce_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright,
__local float *smem, int level, int rows, int cols, int h, int winsz, int channels,
int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth,
int cdisp_step1, int cmsg_step1)
{
int x_out = get_group_id(0);
int y_out = get_group_id(1) % h;
int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2);
int tid = get_local_id(0);
if (d < cndisp)
{
int x0 = x_out << level;
int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0;
float val = 0.0f;
//float val = 528.0f;
if (x0 + tid < cols)
{
if (x0 + tid - d < 0 || d < cth)
val = cdata_weight * cmax_data_term * len;
else
{
__global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
__global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
for(int y = 0; y < len; ++y)
{
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
lle += cimg_step;
lri += cimg_step;
}
}
}
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 256)
if (tid < 128)
dline[tid] += dline[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local float* dline = smem + winsz * get_local_id(2);
if (winsz >= 128)
if (tid < 64)
dline[tid] += dline[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < cndisp)
{
__global float *data_cost = ctemp + y_out * cmsg_step1 + x_out;
__local float* dline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = dline[0];
}
}
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
///////////////////////////////////////////////////////////////
__kernel void compute_data_cost_0(__global const short *selected_disp_pyr, __global short *data_cost_,
__global uchar *cleft, __global uchar *cright,
int h, int w, int level, int nr_plane, int channels,
int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight,
float cmax_data_term, int cimg_step, int cth)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
__global const short *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;
__global short *data_cost = data_cost_ + y * cmsg_step1 + x;
for(int d = 0; d < nr_plane; d++)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int xr = xi - sel_disp;
if (xr < 0 || sel_disp < cth)
val += cdata_weight * cmax_data_term;
else
{
__global uchar* left_x = cleft + yi * cimg_step + xi * channels;
__global uchar* right_x = cright + yi * cimg_step + xr * channels;
if(channels == 1)
val += compute_1(left_x, right_x, cdata_weight, cmax_data_term);
else
val += compute_3(left_x, right_x, cdata_weight, cmax_data_term);
}
}
}
data_cost[cdisp_step1 * d] = convert_short_sat_rte(val);
}
}
}
__kernel void compute_data_cost_1(__global const float *selected_disp_pyr, __global float *data_cost_,
__global uchar *cleft, __global uchar *cright,
int h, int w, int level, int nr_plane, int channels,
int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight,
float cmax_data_term, int cimg_step, int cth)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
__global const float *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;
__global float *data_cost = data_cost_ + y * cmsg_step1 + x;
for(int d = 0; d < nr_plane; d++)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int xr = xi - sel_disp;
if (xr < 0 || sel_disp < cth)
val += cdata_weight * cmax_data_term;
else
{
__global uchar* left_x = cleft + yi * cimg_step + xi * channels;
__global uchar* right_x = cright + yi * cimg_step + xr * channels;
if(channels == 1)
val += compute_1(left_x, right_x, cdata_weight, cmax_data_term);
else
val += compute_3(left_x, right_x, cdata_weight, cmax_data_term);
}
}
}
data_cost[cdisp_step1 * d] = val;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////compute_data_cost_reduce//////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void compute_data_cost_reduce_0(__global const short* selected_disp_pyr, __global short* data_cost_,
__global uchar *cleft, __global uchar *cright,__local float *smem,
int level, int rows, int cols, int h, int nr_plane,
int channels, int winsz,
int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2,
float cdata_weight, float cmax_data_term, int cimg_step,int cth)
{
int x_out = get_group_id(0);
int y_out = get_group_id(1) % h;
int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2);
int tid = get_local_id(0);
__global const short* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;
__global short* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;
if (d < nr_plane)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int x0 = x_out << level;
int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0;
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - sel_disp < 0 || sel_disp < cth)
val = cdata_weight * cmax_data_term * len;
else
{
__global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
__global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
for(int y = 0; y < len; ++y)
{
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
lle += cimg_step;
lri += cimg_step;
}
}
}
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
// if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
//if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
{
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
{
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
{
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
{
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
{
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
{
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = convert_short_sat_rte(vdline[0]);
}
}
__kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr, __global float *data_cost_,
__global uchar *cleft, __global uchar *cright, __local float *smem,
int level, int rows, int cols, int h, int nr_plane,
int channels, int winsz,
int cmsg_step1, int cmsg_step2, int cdisp_step1,int cdisp_step2, float cdata_weight,
float cmax_data_term, int cimg_step, int cth)
{
int x_out = get_group_id(0);
int y_out = get_group_id(1) % h;
int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2);
int tid = get_local_id(0);
__global const float *selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;
__global float *data_cost = data_cost_ + y_out * cmsg_step1 + x_out;
if (d < nr_plane)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int x0 = x_out << level;
int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0;
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - sel_disp < 0 || sel_disp < cth)
val = cdata_weight * cmax_data_term * len;
else
{
__global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
__global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
for(int y = 0; y < len; ++y)
{
if(channels == 1)
val += compute_1(lle, lri, cdata_weight, cmax_data_term);
else
val += compute_3(lle, lri, cdata_weight, cmax_data_term);
lle += cimg_step;
lri += cimg_step;
}
}
}
__local float* dline = smem + winsz * get_local_id(2);
dline[tid] = val;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 64)
{
if (tid < 32)
vdline[tid] += vdline[tid + 32];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 32)
{
if (tid < 16)
vdline[tid] += vdline[tid + 16];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 16)
{
if (tid < 8)
vdline[tid] += vdline[tid + 8];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 8)
{
if (tid < 4)
vdline[tid] += vdline[tid + 4];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 4)
{
if (tid < 2)
vdline[tid] += vdline[tid + 2];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (winsz >= 2)
{
if (tid < 1)
vdline[tid] += vdline[tid + 1];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(d < nr_plane)
{
__local volatile float* vdline = smem + winsz * get_local_id(2);
if (tid == 0)
data_cost[cdisp_step1 * d] = vdline[0];
}
}
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
///////////////////////////////////////////////////////////////
void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
__global short *r_new, __global const short *u_cur, __global const short *d_cur,
__global const short *l_cur, __global const short *r_cur,
__global short *data_cost_selected, __global short *disparity_selected_new,
__global short *data_cost_new, __global const short* data_cost_cur,
__global const short *disparity_selected_cur,
int nr_plane, int nr_plane2,
int cdisp_step1, int cdisp_step2)
{
for(int i = 0; i < nr_plane; i++)
{
short minimum = SHRT_MAX;
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
short cur = data_cost_new[j * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = j;
}
}
data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = SHRT_MAX;
}
}
void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new,
__global float *r_new, __global const float *u_cur, __global const float *d_cur,
__global const float *l_cur, __global const float *r_cur,
__global float *data_cost_selected, __global float *disparity_selected_new,
__global float *data_cost_new, __global const float *data_cost_cur,
__global const float *disparity_selected_cur,
int nr_plane, int nr_plane2,
int cdisp_step1, int cdisp_step2)
{
for(int i = 0; i < nr_plane; i++)
{
float minimum = FLT_MAX;
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
float cur = data_cost_new[j * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = j;
}
}
data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = FLT_MAX;
}
}
__kernel void init_message_0(__global short *u_new_, __global short *d_new_, __global short *l_new_,
__global short *r_new_, __global short *u_cur_, __global const short *d_cur_,
__global const short *l_cur_, __global const short *r_cur_, __global short *ctemp,
__global short *selected_disp_pyr_new, __global const short *selected_disp_pyr_cur,
__global short *data_cost_selected_, __global const short *data_cost_,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2,
int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < h && x < w)
{
__global const short *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
__global const short *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2;
__global const short *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1);
__global const short *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1);
__global short *data_cost_new = ctemp + y * cmsg_step1 + x;
__global const short *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
__global const short *data_cost = data_cost_ + y * cmsg_step1 + x;
for(int d = 0; d < nr_plane2; d++)
{
int idx2 = d * cdisp_step2;
short val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * cdisp_step1] = val;
}
__global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global short *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
__global short *u_new = u_new_ + y * cmsg_step1 + x;
__global short *d_new = d_new_ + y * cmsg_step1 + x;
__global short *l_new = l_new_ + y * cmsg_step1 + x;
__global short *r_new = r_new_ + y * cmsg_step1 + x;
u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
get_first_k_element_increase_0(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
data_cost_selected, disparity_selected_new, data_cost_new,
data_cost, disparity_selected_cur, nr_plane, nr_plane2,
cdisp_step1, cdisp_step2);
}
}
__kernel void init_message_1(__global float *u_new_, __global float *d_new_, __global float *l_new_,
__global float *r_new_, __global const float *u_cur_, __global const float *d_cur_,
__global const float *l_cur_, __global const float *r_cur_, __global float *ctemp,
__global float *selected_disp_pyr_new, __global const float *selected_disp_pyr_cur,
__global float *data_cost_selected_, __global const float *data_cost_,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2,
int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2)
{
int x = get_global_id(0);
int y = get_global_id(1);
__global const float *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
__global const float *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2;
__global const float *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1);
__global const float *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1);
__global float *data_cost_new = ctemp + y * cmsg_step1 + x;
__global const float *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
__global const float *data_cost = data_cost_ + y * cmsg_step1 + x;
if (y < h && x < w)
{
for(int d = 0; d < nr_plane2; d++)
{
int idx2 = d * cdisp_step2;
float val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * cdisp_step1] = val;
}
}
__global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
__global float *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
__global float *u_new = u_new_ + y * cmsg_step1 + x;
__global float *d_new = d_new_ + y * cmsg_step1 + x;
__global float *l_new = l_new_ + y * cmsg_step1 + x;
__global float *r_new = r_new_ + y * cmsg_step1 + x;
barrier(CLK_LOCAL_MEM_FENCE);
if(y < h && x < w)
{
u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
for(int i = 0; i < nr_plane; i++)
{
float minimum = FLT_MAX;
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
float cur = data_cost_new[j * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = j;
}
}
data_cost_selected[i * cdisp_step1] = data_cost[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = FLT_MAX;
}
}
}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
__global const short *msg2, __global const short *msg3,
__global const short *dst_disp, __global const short *src_disp,
int nr_plane, __global short *temp,
float cmax_disc_term, int cdisp_step1, float cdisc_single_jump)
{
short minimum = SHRT_MAX;
for(int d = 0; d < nr_plane; d++)
{
int idx = d * cdisp_step1;
short val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
if(val < minimum)
minimum = val;
msg_dst[idx] = val;
}
float sum = 0;
for(int d = 0; d < nr_plane; d++)
{
float cost_min = minimum + cmax_disc_term;
short src_disp_reg = src_disp[d * cdisp_step1];
for(int d2 = 0; d2 < nr_plane; d2++)
cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] +
cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg)));
temp[d * cdisp_step1] = convert_short_sat_rte(cost_min);
sum += cost_min;
}
sum /= nr_plane;
for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum);
}
void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
__global const float *msg2, __global const float *msg3,
__global const float *dst_disp, __global const float *src_disp,
int nr_plane, __global float *temp,
float cmax_disc_term, int cdisp_step1, float cdisc_single_jump)
{
float minimum = FLT_MAX;
for(int d = 0; d < nr_plane; d++)
{
int idx = d * cdisp_step1;
float val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
if(val < minimum)
minimum = val;
msg_dst[idx] = val;
}
float sum = 0;
for(int d = 0; d < nr_plane; d++)
{
float cost_min = minimum + cmax_disc_term;
float src_disp_reg = src_disp[d * cdisp_step1];
for(int d2 = 0; d2 < nr_plane; d2++)
cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] +
cdisc_single_jump * fabs(dst_disp[d2 * cdisp_step1] - src_disp_reg)));
temp[d * cdisp_step1] = cost_min;
sum += cost_min;
}
sum /= nr_plane;
for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum;
}
__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_,
__global const short *data_cost_selected, __global const short *selected_disp_pyr_cur,
__global short *ctemp, int h, int w, int nr_plane, int i,
float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump)
{
int y = get_global_id(1);
int x = ((get_global_id(0)) << 1) + ((y + i) & 1);
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
__global const short *data = data_cost_selected + y * cmsg_step1 + x;
__global short *u = u_ + y * cmsg_step1 + x;
__global short *d = d_ + y * cmsg_step1 + x;
__global short *l = l_ + y * cmsg_step1 + x;
__global short *r = r_ + y * cmsg_step1 + x;
__global const short *disp = selected_disp_pyr_cur + y * cmsg_step1 + x;
__global short *temp = ctemp + y * cmsg_step1 + x;
message_per_pixel_0(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_0(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_0(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_0(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
}
}
__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_,
__global const float *data_cost_selected, __global const float *selected_disp_pyr_cur,
__global float *ctemp, int h, int w, int nr_plane, int i,
float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump)
{
int y = get_global_id(1);
int x = ((get_global_id(0)) << 1) + ((y + i) & 1);
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
__global const float *data = data_cost_selected + y * cmsg_step1 + x;
__global float *u = u_ + y * cmsg_step1 + x;
__global float *d = d_ + y * cmsg_step1 + x;
__global float *l = l_ + y * cmsg_step1 + x;
__global float *r = r_ + y * cmsg_step1 + x;
__global const float *disp = selected_disp_pyr_cur + y * cmsg_step1 + x;
__global float *temp = ctemp + y * cmsg_step1 + x;
message_per_pixel_1(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_1(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_1(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
message_per_pixel_1(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp,
cmax_disc_term, cdisp_step1, cdisc_single_jump);
}
}
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_,
__global const short *r_, __global const short * data_cost_selected,
__global const short *disp_selected_pyr,
__global short* disp,
int res_step, int cols, int rows, int nr_plane,
int cmsg_step1, int cdisp_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
{
__global const short *data = data_cost_selected + y * cmsg_step1 + x;
__global const short *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
__global const short *u = u_ + (y+1) * cmsg_step1 + (x+0);
__global const short *d = d_ + (y-1) * cmsg_step1 + (x+0);
__global const short *l = l_ + (y+0) * cmsg_step1 + (x+1);
__global const short *r = r_ + (y+0) * cmsg_step1 + (x-1);
short best = 0;
short best_val = SHRT_MAX;
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
short val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
if (val < best_val)
{
best_val = val;
best = disp_selected[idx];
}
}
disp[res_step * y + x] = best;
}
}
__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_,
__global const float *r_, __global const float *data_cost_selected,
__global const float *disp_selected_pyr,
__global short *disp,
int res_step, int cols, int rows, int nr_plane,
int cmsg_step1, int cdisp_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
{
__global const float *data = data_cost_selected + y * cmsg_step1 + x;
__global const float *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
__global const float *u = u_ + (y+1) * cmsg_step1 + (x+0);
__global const float *d = d_ + (y-1) * cmsg_step1 + (x+0);
__global const float *l = l_ + (y+0) * cmsg_step1 + (x+1);
__global const float *r = r_ + (y+0) * cmsg_step1 + (x-1);
short best = 0;
short best_val = SHRT_MAX;
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
float val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
if (val < best_val)
{
best_val = val;
best = convert_short_sat_rte(disp_selected[idx]);
}
}
disp[res_step * y + x] = best;
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Jin Ma, jin@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
#if !defined (HAVE_OPENCL)
namespace cv
{
namespace ocl
{
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &)
{
throw_nogpu();
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int)
{
throw_nogpu();
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float,
float, float, int, int)
{
throw_nogpu();
}
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &)
{
throw_nogpu();
}
}
}
#else /* !defined (HAVE_OPENCL) */
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *stereocsbp;
}
}
namespace cv
{
namespace ocl
{
namespace stereoCSBP
{
//////////////////////////////////////////////////////////////////////////
//////////////////////////////common////////////////////////////////////
////////////////////////////////////////////////////////////////////////
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
static string get_kernel_name(string kernel_name, int data_type)
{
stringstream idxStr;
if(data_type == CV_16S)
idxStr << "0";
else
idxStr << "1";
kernel_name += idxStr.str();
return kernel_name;
}
using cv::ocl::StereoConstantSpaceBP;
//////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////init_data_cost//////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp,
StereoConstantSpaceBP &rthis,
int msg_step, int h, int w, int level)
{
Context *clCxt = left.clCxt;
int data_type = rthis.msg_type;
int channels = left.oclchannels();
string kernelName = get_kernel_name("init_data_cost_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8 ,1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int cdisp_step1 = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp,
StereoConstantSpaceBP &rthis,
int msg_step, int h, int w, int level)
{
Context *clCxt = left.clCxt;
int data_type = rthis.msg_type;
int channels = left.oclchannels();
int win_size = (int)std::pow(2.f, level);
string kernelName = get_kernel_name("init_data_cost_reduce_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
const int threadsNum = 256;
//size_t blockSize = threadsNum;
size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
size_t globalThreads[3] = {w *localThreads[0],
h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
};
int local_mem_size = threadsNum * sizeof(float);
int cdisp_step1 = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1));
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
oclMat &temp, StereoConstantSpaceBP &rthis,
int h, int w, int nr_plane, int msg_step)
{
Context *clCxt = temp.clCxt;
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("get_first_k_initial_local_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8 ,1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
oclMat &temp, StereoConstantSpaceBP &rthis,
int h, int w, int nr_plane, int msg_step)
{
Context *clCxt = temp.clCxt;
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("get_first_k_initial_global_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis,
uchar *disp_selected_pyr, uchar *data_cost_selected,
size_t msg_step, int h, int w, int level, int nr_plane)
{
if(level <= 1)
init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level);
else
init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level);
if(rthis.use_local_init_data_cost == true)
{
get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step);
}
else
{
get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w,
nr_plane, msg_step);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////compute_data_cost//////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost,
StereoConstantSpaceBP &rthis, int msg_step1,
int msg_step2, const oclMat &left, const oclMat &right, int h,
int w, int h2, int level, int nr_plane)
{
Context *clCxt = left.clCxt;
int channels = left.oclchannels();
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("compute_data_cost_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step1 = msg_step1 * h;
int disp_step2 = msg_step2 * h2;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
StereoConstantSpaceBP &rthis, int msg_step1,
int msg_step2, const oclMat &left, const oclMat &right, int h,
int w, int h2, int level, int nr_plane)
{
Context *clCxt = left.clCxt;
int data_type = rthis.msg_type;
int channels = left.oclchannels();
int win_size = (int)std::pow(2.f, level);
string kernelName = get_kernel_name("compute_data_cost_reduce_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
const size_t threadsNum = 256;
//size_t blockSize = threadsNum;
size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
size_t globalThreads[3] = {w *localThreads[0],
h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
};
int disp_step1 = msg_step1 * h;
int disp_step2 = msg_step2 * h2;
size_t local_mem_size = threadsNum * sizeof(float);
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1));
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2));
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w,
int h2, int level, int nr_plane)
{
if(level <= 1)
compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
left, right, h, w, h2, level, nr_plane);
else
compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
left, right, h, w, h2, level, nr_plane);
}
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////init message//////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new,
uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur,
uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur,
uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis,
size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane,
int h2, int w2, int nr_plane2)
{
Context *clCxt = temp.clCxt;
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("init_message_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step1 = msg_step1 * h;
int disp_step2 = msg_step2 * h2;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2));
openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2));
openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2));
openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1));
openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1));
openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////calc_all_iterations////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
int msg_step, int h, int w, int nr_plane, int i)
{
Context *clCxt = temp.clCxt;
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("compute_message_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&i));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_float), (void *)&rthis.max_disc_term));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
int msg_step, int h, int w, int nr_plane)
{
for(int t = 0; t < rthis.iters; t++)
calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
msg_step, h, w, nr_plane, t & 1);
}
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////compute_disp////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////
static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
oclMat &disp, int nr_plane)
{
Context *clCxt = disp.clCxt;
int data_type = rthis.msg_type;
string kernelName = get_kernel_name("compute_disp_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0],
divUp(disp.rows, localThreads[1]) *localThreads[1],
1
};
int step_size = disp.step / disp.elemSize();
int disp_step = disp.rows * msg_step;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
}
}
}
namespace
{
const float DEFAULT_MAX_DATA_TERM = 30.0f;
const float DEFAULT_DATA_WEIGHT = 1.0f;
const float DEFAULT_MAX_DISC_TERM = 160.0f;
const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
}
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
{
ndisp = (int) ((float) width / 3.14f);
if ((ndisp & 1) != 0)
ndisp++;
int mm = ::max(width, height);
iters = mm / 100 + ((mm > 1200) ? - 4 : 4);
levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
if (levels == 0) levels++;
nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
int msg_type_)
: ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
msg_type(msg_type_), use_local_init_data_cost(true)
{
CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
int min_disp_th_, int msg_type_)
: ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
max_data_term(max_data_term_), data_weight(data_weight_),
max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
msg_type(msg_type_), use_local_init_data_cost(true)
{
CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
template<class T>
static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
{
CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
&& left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));
const Scalar zero = Scalar::all(0);
////////////////////////////////////Init///////////////////////////////////////////////////
int rows = left.rows;
int cols = left.cols;
rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
int levels = rthis.levels;
AutoBuffer<int> buf(levels * 4);
int *cols_pyr = buf;
int *rows_pyr = cols_pyr + levels;
int *nr_plane_pyr = rows_pyr + levels;
int *step_pyr = nr_plane_pyr + levels;
cols_pyr[0] = cols;
rows_pyr[0] = rows;
nr_plane_pyr[0] = rthis.nr_plane;
const int n = 64;
step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
for (int i = 1; i < levels; i++)
{
cols_pyr[i] = cols_pyr[i - 1] / 2;
rows_pyr[i] = rows_pyr[i - 1]/ 2;
nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;
step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
}
Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);
u[0].create(msg_size, DataType<T>::type);
d[0].create(msg_size, DataType<T>::type);
l[0].create(msg_size, DataType<T>::type);
r[0].create(msg_size, DataType<T>::type);
u[1].create(msg_size, DataType<T>::type);
d[1].create(msg_size, DataType<T>::type);
l[1].create(msg_size, DataType<T>::type);
r[1].create(msg_size, DataType<T>::type);
disp_selected_pyr[0].create(msg_size, DataType<T>::type);
disp_selected_pyr[1].create(msg_size, DataType<T>::type);
data_cost.create(data_cost_size, DataType<T>::type);
data_cost_selected.create(msg_size, DataType<T>::type);
Size temp_size = data_cost_size;
if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);
temp.create(temp_size, DataType<T>::type);
temp = zero;
///////////////////////////////// Compute////////////////////////////////////////////////
//csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
// rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
l[0] = zero;
d[0] = zero;
r[0] = zero;
u[0] = zero;
disp_selected_pyr[0] = zero;
l[1] = zero;
d[1] = zero;
r[1] = zero;
u[1] = zero;
disp_selected_pyr[1] = zero;
data_cost = zero;
data_cost_selected = zero;
int cur_idx = 0;
for (int i = levels - 1; i >= 0; i--)
{
if (i == levels - 1)
{
cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
i, nr_plane_pyr[i]);
}
else
{
cv::ocl::stereoCSBP::compute_data_cost(
disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
nr_plane_pyr[i + 1]);
int new_idx = (cur_idx + 1) & 1;
cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
cols_pyr[i + 1], nr_plane_pyr[i + 1]);
cur_idx = new_idx;
}
cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
}
if (disp.empty())
disp.create(rows, cols, CV_16S);
out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
out = zero;
stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
out, nr_plane_pyr[0]);
if (disp.type() != CV_16S)
out.convertTo(disp, disp.type());
}
typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp);
const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
{
CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
left, right, disp);
}
#endif /* !defined (HAVE_OPENCL) */
...@@ -134,4 +134,64 @@ TEST_P(StereoMatchBP, Regression) ...@@ -134,4 +134,64 @@ TEST_P(StereoMatchBP, Regression)
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64), INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
testing::Values(8),testing::Values(2),testing::Values(25.0f), testing::Values(8),testing::Values(2),testing::Values(25.0f),
testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f))); testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
//////////////////////////////////////////////////////////////////////////
// ConstSpaceBeliefPropagation
PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int)
{
int ndisp_;
int iters_;
int levels_;
int nr_plane_;
float max_data_term_;
float data_weight_;
float max_disc_term_;
float disc_single_jump_;
int min_disp_th_;
int msg_type_;
virtual void SetUp()
{
ndisp_ = GET_PARAM(0);
iters_ = GET_PARAM(1);
levels_ = GET_PARAM(2);
nr_plane_ = GET_PARAM(3);
max_data_term_ = GET_PARAM(4);
data_weight_ = GET_PARAM(5);
max_disc_term_ = GET_PARAM(6);
disc_single_jump_ = GET_PARAM(7);
min_disp_th_ = GET_PARAM(8);
msg_type_ = GET_PARAM(9);
}
};
TEST_P(StereoMatchConstSpaceBP, Regression)
{
Mat left_image = readImage("csstereobp/aloe-L.png");
Mat right_image = readImage("csstereobp/aloe-R.png");
Mat disp_gold = readImage("csstereobp/aloe-disp.png", IMREAD_GRAYSCALE);
ocl::oclMat d_left, d_right;
ocl::oclMat d_disp;
Mat disp;
ASSERT_FALSE(left_image.empty());
ASSERT_FALSE(right_image.empty());
ASSERT_FALSE(disp_gold.empty());
d_left.upload(left_image);
d_right.upload(right_image);
ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_,
max_disc_term_, disc_single_jump_, 0, CV_32F);
bp(d_left, d_right, d_disp);
d_disp.download(disp);
disp.convertTo(disp, disp_gold.depth());
EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4);
//EXPECT_MAT_NEAR(disp_gold, disp, 1.0, "");
}
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128),
testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f),
testing::Values(1.0f),testing::Values(160.0f),
testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F)));
#endif // HAVE_OPENCL #endif // HAVE_OPENCL
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment