// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.


// #if defined _HFS_CUDA_ON_

#include "../precomp.hpp"
#include "../magnitude/magnitude.hpp"

namespace cv { namespace hfs {

__global__ void derrivativeXYDevice(const uchar *gray_img,
    int *delta_x, int *delta_y, int *mag, Vector2i img_size)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x > img_size.x - 1 || y > img_size.y - 1)
        return;

    int idx = y*img_size.x + x;

    if (x == 0)
        delta_x[idx] = gray_img[idx + 1] - gray_img[idx];
    else if (x == img_size.x - 1)
        delta_x[idx] = gray_img[idx] - gray_img[idx - 1];
    else
        delta_x[idx] = gray_img[idx + 1] - gray_img[idx - 1];

    if (y == 0)
        delta_y[idx] = gray_img[idx + img_size.x] - gray_img[idx];
    else if (y == img_size.y - 1)
        delta_y[idx] = gray_img[idx] - gray_img[idx - img_size.x];
    else
        delta_y[idx] = gray_img[idx + img_size.x] - gray_img[idx - img_size.x];

    mag[idx] = (int)(0.5 +
        sqrt((double)(delta_x[idx] * delta_x[idx] + delta_y[idx] * delta_y[idx])));
}

__device__ __forceinline__ int dmin(int a, int b)
{
    return a < b ? a : b;
}

__device__ __forceinline__ int dmax(int a, int b)
{
    return a > b ? a : b;
}

__global__ void nonMaxSuppDevice(uchar *nms_mag,
    int *delta_x, int *delta_y, int *mag, Vector2i img_size)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x > img_size.x - 1 || y > img_size.y - 1) return;

    int idx = y*img_size.x + x;

    if (x == 0 || x == img_size.x - 1 || y == 0 || y == img_size.y - 1)
    {
        nms_mag[idx] = 0;
        return;
    }

    int m00, gx, gy, z1, z2;
    double mag1, mag2, xprep, yprep;

    m00 = mag[idx];
    if (m00 == 0)
    {
        nms_mag[idx] = 0;
        return;
    }
    else
    {
        xprep = -(gx = delta_x[idx]) / ((double)m00);
        yprep = (gy = delta_y[idx]) / ((double)m00);
    }

    if (gx >= 0)
    {
        if (gy >= 0)
        {
            if (gx >= gy)
            {
                z1 = mag[idx - 1];
                z2 = mag[idx - img_size.x - 1];
                mag1 = (m00 - z1)*xprep + (z2 - z1)*yprep;

                z1 = mag[idx + 1];
                z2 = mag[idx + img_size.x + 1];
                mag2 = (m00 - z1)*xprep + (z2 - z1)*yprep;
            }
            else
            {
                z1 = mag[idx - img_size.x];
                z2 = mag[idx - img_size.x - 1];
                mag1 = (z1 - z2)*xprep + (z1 - m00)*yprep;

                z1 = mag[idx + img_size.x];
                z2 = mag[idx + img_size.x + 1];
                mag2 = (z1 - z2)*xprep + (z1 - m00)*yprep;
            }
        }
        else
        {
            if (gx >= -gy)
            {
                z1 = mag[idx - 1];
                z2 = mag[idx + img_size.x - 1];
                mag1 = (m00 - z1)*xprep + (z1 - z2)*yprep;

                z1 = mag[idx + 1];
                z2 = mag[idx - img_size.x + 1];
                mag2 = (m00 - z1)*xprep + (z1 - z2)*yprep;
            }
            else
            {
                z1 = mag[idx + img_size.x];
                z2 = mag[idx + img_size.x - 1];
                mag1 = (z1 - z2)*xprep + (m00 - z1)*yprep;

                z1 = mag[idx - img_size.x];
                z2 = mag[idx - img_size.x + 1];
                mag2 = (z1 - z2)*xprep + (m00 - z1)*yprep;
            }
        }
    }
    else
    {
        if (gy >= 0)
        {
            if (-gx >= gy)
            {
                z1 = mag[idx + 1];
                z2 = mag[idx - img_size.x + 1];
                mag1 = (z1 - m00)*xprep + (z2 - z1)*yprep;

                z1 = mag[idx - 1];
                z2 = mag[idx + img_size.x - 1];
                mag2 = (z1 - m00)*xprep + (z2 - z1)*yprep;
            }
            else
            {
                z1 = mag[idx - img_size.x];
                z2 = mag[idx - img_size.x + 1];
                mag1 = (z2 - z1)*xprep + (z1 - m00)*yprep;

                z1 = mag[idx + img_size.x];
                z2 = mag[idx + img_size.x - 1];
                mag2 = (z2 - z1)*xprep + (z1 - m00)*yprep;
            }
        }
        else
        {
            if (-gx > -gy)
            {
                z1 = mag[idx + 1];
                z2 = mag[idx + img_size.x + 1];
                mag1 = (z1 - m00)*xprep + (z1 - z2)*yprep;

                z1 = mag[idx - 1];
                z2 = mag[idx - img_size.x - 1];
                mag2 = (z1 - m00)*xprep + (z1 - z2)*yprep;
            }
            else
            {
                z1 = mag[idx + img_size.x];
                z2 = mag[idx + img_size.x + 1];
                mag1 = (z2 - z1)*xprep + (m00 - z1)*yprep;

                z1 = mag[idx - img_size.x];
                z2 = mag[idx - img_size.x - 1];
                mag2 = (z2 - z1)*xprep + (m00 - z1)*yprep;
            }
        }
    }

    if (mag1 > 0 || mag2 >= 0)
        nms_mag[idx] = 0;
    else
        nms_mag[idx] = (uchar)dmin(dmax(m00, 0), 255);
}

void Magnitude::derrivativeXYGpu()
{
    uchar *gray_ptr = gray_img->getGpuData();
    int *dx_ptr = delta_x->getGpuData();
    int *dy_ptr = delta_y->getGpuData();
    int *mag_ptr = mag->getGpuData();

    dim3 blockSize(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
    dim3 gridSize((int)ceil((float)img_size.x / (float)blockSize.x),
        (int)ceil((float)img_size.y / (float)blockSize.y));

    derrivativeXYDevice << <gridSize, blockSize >> >
        (gray_ptr, dx_ptr, dy_ptr, mag_ptr, img_size);
}

void Magnitude::nonMaxSuppGpu()
{
    int *dx_ptr = delta_x->getGpuData();
    int *dy_ptr = delta_y->getGpuData();
    int *mag_ptr = mag->getGpuData();
    uchar *nms_ptr = nms_mag->getGpuData();

    dim3 blockSize(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
    dim3 gridSize((int)ceil((float)img_size.x / (float)blockSize.x),
        (int)ceil((float)img_size.y / (float)blockSize.y));

    nonMaxSuppDevice << <gridSize, blockSize >> >
        (nms_ptr, dx_ptr, dy_ptr, mag_ptr, img_size);
}

void Magnitude::processImgGpu(const Mat& bgr3u, Mat& mag1u)
{
    Mat gray, blur1u;
    cvtColor(bgr3u, gray, COLOR_BGR2GRAY);
    GaussianBlur(gray, blur1u, Size(7, 7), 1, 1);

    img_size.x = bgr3u.cols;
    img_size.y = bgr3u.rows;

    loadImage(blur1u, gray_img);
    gray_img->updateDeviceFromHost();
    derrivativeXYGpu();
    nonMaxSuppGpu();
    mag1u.create(bgr3u.rows, bgr3u.cols, CV_8UC1);
    nms_mag->updateHostFromDevice();
    loadImage(nms_mag, mag1u);
}

}}

// #endif