void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);