Commit 2544c7ea authored by Philip Langdale's avatar Philip Langdale Committed by Timo Rothenpieler

avfilter/vf_scale_cuda: Switch to using ffnvcodec

This change switches the vf_scale_cuda filter from using the
full cuda sdk to using the ffnvcodec headers and loader.

Most of the change is a direct mapping, but I also switched from
using texture references to using texture objects. This is supposed
to be the preferred way of using textures, and the texture object API
is the one I added to ffnvcodec.
Signed-off-by: 's avatarPhilip Langdale <philipl@overt.org>
Signed-off-by: 's avatarTimo Rothenpieler <timo@rothenpieler.org>
parent 7debf427
...@@ -2975,7 +2975,7 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" ...@@ -2975,7 +2975,7 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
hwupload_cuda_filter_deps="ffnvcodec" hwupload_cuda_filter_deps="ffnvcodec"
scale_npp_filter_deps="ffnvcodec libnpp" scale_npp_filter_deps="ffnvcodec libnpp"
scale_cuda_filter_deps="cuda_sdk" scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
thumbnail_cuda_filter_deps="cuda_sdk" thumbnail_cuda_filter_deps="cuda_sdk"
transpose_npp_filter_deps="ffnvcodec libnpp" transpose_npp_filter_deps="ffnvcodec libnpp"
......
...@@ -20,14 +20,13 @@ ...@@ -20,14 +20,13 @@
* DEALINGS IN THE SOFTWARE. * DEALINGS IN THE SOFTWARE.
*/ */
#include <cuda.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include "libavutil/avstring.h" #include "libavutil/avstring.h"
#include "libavutil/common.h" #include "libavutil/common.h"
#include "libavutil/hwcontext.h" #include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_cuda.h" #include "libavutil/hwcontext_cuda_internal.h"
#include "libavutil/cuda_check.h" #include "libavutil/cuda_check.h"
#include "libavutil/internal.h" #include "libavutil/internal.h"
#include "libavutil/opt.h" #include "libavutil/opt.h"
...@@ -53,10 +52,13 @@ static const enum AVPixelFormat supported_formats[] = { ...@@ -53,10 +52,13 @@ static const enum AVPixelFormat supported_formats[] = {
#define BLOCKX 32 #define BLOCKX 32
#define BLOCKY 16 #define BLOCKY 16
#define CHECK_CU(x) FF_CUDA_CHECK(ctx, x) #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
typedef struct CUDAScaleContext { typedef struct CUDAScaleContext {
const AVClass *class; const AVClass *class;
AVCUDADeviceContext *hwctx;
enum AVPixelFormat in_fmt; enum AVPixelFormat in_fmt;
enum AVPixelFormat out_fmt; enum AVPixelFormat out_fmt;
...@@ -80,7 +82,6 @@ typedef struct CUDAScaleContext { ...@@ -80,7 +82,6 @@ typedef struct CUDAScaleContext {
char *h_expr; ///< height expression string char *h_expr; ///< height expression string
CUcontext cu_ctx; CUcontext cu_ctx;
CUevent cu_event;
CUmodule cu_module; CUmodule cu_module;
CUfunction cu_func_uchar; CUfunction cu_func_uchar;
CUfunction cu_func_uchar2; CUfunction cu_func_uchar2;
...@@ -88,12 +89,7 @@ typedef struct CUDAScaleContext { ...@@ -88,12 +89,7 @@ typedef struct CUDAScaleContext {
CUfunction cu_func_ushort; CUfunction cu_func_ushort;
CUfunction cu_func_ushort2; CUfunction cu_func_ushort2;
CUfunction cu_func_ushort4; CUfunction cu_func_ushort4;
CUtexref cu_tex_uchar; CUstream cu_stream;
CUtexref cu_tex_uchar2;
CUtexref cu_tex_uchar4;
CUtexref cu_tex_ushort;
CUtexref cu_tex_ushort2;
CUtexref cu_tex_ushort4;
CUdeviceptr srcBuffer; CUdeviceptr srcBuffer;
CUdeviceptr dstBuffer; CUdeviceptr dstBuffer;
...@@ -258,48 +254,49 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink) ...@@ -258,48 +254,49 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink)
AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
CudaFunctions *cu = device_hwctx->internal->cuda_dl;
int w, h; int w, h;
int ret; int ret;
extern char vf_scale_cuda_ptx[]; extern char vf_scale_cuda_ptx[];
ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx)); s->hwctx = device_hwctx;
s->cu_stream = s->hwctx->stream;
ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
if (ret < 0)
goto fail;
ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx));
if (ret < 0)
goto fail;
CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
if (ret < 0)
goto fail;
CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"));
if (ret < 0)
goto fail;
CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
if (ret < 0)
goto fail;
CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"));
if (ret < 0) if (ret < 0)
goto fail; goto fail;
ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx)); CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"));
if (ret < 0)
goto fail;
CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"));
if (ret < 0) if (ret < 0)
goto fail; goto fail;
CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2")); CHECK_CU(cu->cuCtxPopCurrent(&dummy));
CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"));
CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"));
CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"));
CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR));
CHECK_CU(cuCtxPopCurrent(&dummy));
if ((ret = ff_scale_eval_dimensions(s, if ((ret = ff_scale_eval_dimensions(s,
s->w_expr, s->h_expr, s->w_expr, s->h_expr,
...@@ -335,30 +332,48 @@ fail: ...@@ -335,30 +332,48 @@ fail:
return ret; return ret;
} }
static int call_resize_kernel(CUDAScaleContext *ctx, CUfunction func, CUtexref tex, int channels, static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, int channels,
uint8_t *src_dptr, int src_width, int src_height, int src_pitch, uint8_t *src_dptr, int src_width, int src_height, int src_pitch,
uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch,
int pixel_size) int pixel_size)
{ {
CUdeviceptr src_devptr = (CUdeviceptr)src_dptr; CUDAScaleContext *s = ctx->priv;
CudaFunctions *cu = s->hwctx->internal->cuda_dl;
CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr;
void *args_uchar[] = { &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; CUtexObject tex = 0;
CUDA_ARRAY_DESCRIPTOR desc; void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height };
int ret;
desc.Width = src_width;
desc.Height = src_height; CUDA_TEXTURE_DESC tex_desc = {
desc.NumChannels = channels; .filterMode = CU_TR_FILTER_MODE_LINEAR,
if (pixel_size == 1) { .flags = CU_TRSF_READ_AS_INTEGER,
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; };
} else {
desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
}
CHECK_CU(cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch * pixel_size)); CUDA_RESOURCE_DESC res_desc = {
CHECK_CU(cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, .resType = CU_RESOURCE_TYPE_PITCH2D,
BLOCKX, BLOCKY, 1, 0, 0, args_uchar, NULL)); .res.pitch2D.format = pixel_size == 1 ?
CU_AD_FORMAT_UNSIGNED_INT8 :
CU_AD_FORMAT_UNSIGNED_INT16,
.res.pitch2D.numChannels = channels,
.res.pitch2D.width = src_width,
.res.pitch2D.height = src_height,
.res.pitch2D.pitchInBytes = src_pitch,
.res.pitch2D.devPtr = (CUdeviceptr)src_dptr,
};
return 0; ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL));
if (ret < 0)
goto exit;
ret = CHECK_CU(cu->cuLaunchKernel(func,
DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL));
exit:
if (tex)
CHECK_CU(cu->cuTexObjectDestroy(tex));
return ret;
} }
static int scalecuda_resize(AVFilterContext *ctx, static int scalecuda_resize(AVFilterContext *ctx,
...@@ -369,59 +384,59 @@ static int scalecuda_resize(AVFilterContext *ctx, ...@@ -369,59 +384,59 @@ static int scalecuda_resize(AVFilterContext *ctx,
switch (in_frames_ctx->sw_format) { switch (in_frames_ctx->sw_format) {
case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUV420P:
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0], in->width, in->height, in->linesize[0], in->data[0], in->width, in->height, in->linesize[0],
out->data[0], out->width, out->height, out->linesize[0], out->data[0], out->width, out->height, out->linesize[0],
1); 1);
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2, in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2,
out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2, out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2,
1); 1);
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2, in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2,
out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2, out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2,
1); 1);
break; break;
case AV_PIX_FMT_YUV444P: case AV_PIX_FMT_YUV444P:
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0], in->width, in->height, in->linesize[0], in->data[0], in->width, in->height, in->linesize[0],
out->data[0], out->width, out->height, out->linesize[0], out->data[0], out->width, out->height, out->linesize[0],
1); 1);
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0], in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0],
out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0], out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0],
1); 1);
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0], in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0],
out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0], out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0],
1); 1);
break; break;
case AV_PIX_FMT_NV12: case AV_PIX_FMT_NV12:
call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, call_resize_kernel(ctx, s->cu_func_uchar, 1,
in->data[0], in->width, in->height, in->linesize[0], in->data[0], in->width, in->height, in->linesize[0],
out->data[0], out->width, out->height, out->linesize[0], out->data[0], out->width, out->height, out->linesize[0],
1); 1);
call_resize_kernel(s, s->cu_func_uchar2, s->cu_tex_uchar2, 2, call_resize_kernel(ctx, s->cu_func_uchar2, 2,
in->data[1], in->width/2, in->height/2, in->linesize[1], in->data[1], in->width/2, in->height/2, in->linesize[1],
out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2, out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2,
1); 1);
break; break;
case AV_PIX_FMT_P010LE: case AV_PIX_FMT_P010LE:
call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, call_resize_kernel(ctx, s->cu_func_ushort, 1,
in->data[0], in->width, in->height, in->linesize[0]/2, in->data[0], in->width, in->height, in->linesize[0]/2,
out->data[0], out->width, out->height, out->linesize[0]/2, out->data[0], out->width, out->height, out->linesize[0]/2,
2); 2);
call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, call_resize_kernel(ctx, s->cu_func_ushort2, 2,
in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2, in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2,
out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4,
2); 2);
break; break;
case AV_PIX_FMT_P016LE: case AV_PIX_FMT_P016LE:
call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, call_resize_kernel(ctx, s->cu_func_ushort, 1,
in->data[0], in->width, in->height, in->linesize[0] / 2, in->data[0], in->width, in->height, in->linesize[0] / 2,
out->data[0], out->width, out->height, out->linesize[0] / 2, out->data[0], out->width, out->height, out->linesize[0] / 2,
2); 2);
call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, call_resize_kernel(ctx, s->cu_func_ushort2, 2,
in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2, in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2,
out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4,
2); 2);
...@@ -460,11 +475,10 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) ...@@ -460,11 +475,10 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
{ {
AVFilterContext *ctx = link->dst; AVFilterContext *ctx = link->dst;
CUDAScaleContext *s = ctx->priv; CUDAScaleContext *s = ctx->priv;
AVFilterLink *outlink = ctx->outputs[0]; AVFilterLink *outlink = ctx->outputs[0];
AVHWFramesContext *frames_ctx = (AVHWFramesContext*)s->frames_ctx->data; CudaFunctions *cu = s->hwctx->internal->cuda_dl;
AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;
AVFrame *out = NULL; AVFrame *out = NULL;
CUcontext dummy; CUcontext dummy;
...@@ -476,13 +490,13 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) ...@@ -476,13 +490,13 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
goto fail; goto fail;
} }
ret = CHECK_CU(cuCtxPushCurrent(device_hwctx->cuda_ctx)); ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
if (ret < 0) if (ret < 0)
goto fail; goto fail;
ret = cudascale_scale(ctx, out, in); ret = cudascale_scale(ctx, out, in);
CHECK_CU(cuCtxPopCurrent(&dummy)); CHECK_CU(cu->cuCtxPopCurrent(&dummy));
if (ret < 0) if (ret < 0)
goto fail; goto fail;
......
...@@ -22,14 +22,8 @@ ...@@ -22,14 +22,8 @@
extern "C" { extern "C" {
texture<unsigned char, 2> uchar_tex; __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
texture<uchar2, 2> uchar2_tex; unsigned char *dst,
texture<uchar4, 2> uchar4_tex;
texture<unsigned short, 2> ushort_tex;
texture<ushort2, 2> ushort2_tex;
texture<ushort4, 2> ushort4_tex;
__global__ void Subsample_Bilinear_uchar(unsigned char *dst,
int dst_width, int dst_height, int dst_pitch, int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -48,15 +42,16 @@ __global__ void Subsample_Bilinear_uchar(unsigned char *dst, ...@@ -48,15 +42,16 @@ __global__ void Subsample_Bilinear_uchar(unsigned char *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
int y0 = tex2D(uchar_tex, xi-dx, yi-dy); int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy);
int y1 = tex2D(uchar_tex, xi+dx, yi-dy); int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy);
int y2 = tex2D(uchar_tex, xi-dx, yi+dy); int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy);
int y3 = tex2D(uchar_tex, xi+dx, yi+dy); int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy);
dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2); dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2);
} }
} }
__global__ void Subsample_Bilinear_uchar2(uchar2 *dst, __global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
uchar2 *dst,
int dst_width, int dst_height, int dst_pitch2, int dst_width, int dst_height, int dst_pitch2,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -75,10 +70,10 @@ __global__ void Subsample_Bilinear_uchar2(uchar2 *dst, ...@@ -75,10 +70,10 @@ __global__ void Subsample_Bilinear_uchar2(uchar2 *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
uchar2 c0 = tex2D(uchar2_tex, xi-dx, yi-dy); uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy);
uchar2 c1 = tex2D(uchar2_tex, xi+dx, yi-dy); uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy);
uchar2 c2 = tex2D(uchar2_tex, xi-dx, yi+dy); uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy);
uchar2 c3 = tex2D(uchar2_tex, xi+dx, yi+dy); uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy);
int2 uv; int2 uv;
uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
...@@ -86,7 +81,8 @@ __global__ void Subsample_Bilinear_uchar2(uchar2 *dst, ...@@ -86,7 +81,8 @@ __global__ void Subsample_Bilinear_uchar2(uchar2 *dst,
} }
} }
__global__ void Subsample_Bilinear_uchar4(uchar4 *dst, __global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
uchar4 *dst,
int dst_width, int dst_height, int dst_pitch, int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -105,10 +101,10 @@ __global__ void Subsample_Bilinear_uchar4(uchar4 *dst, ...@@ -105,10 +101,10 @@ __global__ void Subsample_Bilinear_uchar4(uchar4 *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
uchar4 c0 = tex2D(uchar4_tex, xi-dx, yi-dy); uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy);
uchar4 c1 = tex2D(uchar4_tex, xi+dx, yi-dy); uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy);
uchar4 c2 = tex2D(uchar4_tex, xi-dx, yi+dy); uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy);
uchar4 c3 = tex2D(uchar4_tex, xi+dx, yi+dy); uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy);
int4 res; int4 res;
res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
...@@ -119,7 +115,8 @@ __global__ void Subsample_Bilinear_uchar4(uchar4 *dst, ...@@ -119,7 +115,8 @@ __global__ void Subsample_Bilinear_uchar4(uchar4 *dst,
} }
} }
__global__ void Subsample_Bilinear_ushort(unsigned short *dst, __global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
unsigned short *dst,
int dst_width, int dst_height, int dst_pitch, int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -138,15 +135,16 @@ __global__ void Subsample_Bilinear_ushort(unsigned short *dst, ...@@ -138,15 +135,16 @@ __global__ void Subsample_Bilinear_ushort(unsigned short *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
int y0 = tex2D(ushort_tex, xi-dx, yi-dy); int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy);
int y1 = tex2D(ushort_tex, xi+dx, yi-dy); int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy);
int y2 = tex2D(ushort_tex, xi-dx, yi+dy); int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy);
int y3 = tex2D(ushort_tex, xi+dx, yi+dy); int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy);
dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2); dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2);
} }
} }
__global__ void Subsample_Bilinear_ushort2(ushort2 *dst, __global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
ushort2 *dst,
int dst_width, int dst_height, int dst_pitch2, int dst_width, int dst_height, int dst_pitch2,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -165,10 +163,10 @@ __global__ void Subsample_Bilinear_ushort2(ushort2 *dst, ...@@ -165,10 +163,10 @@ __global__ void Subsample_Bilinear_ushort2(ushort2 *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
ushort2 c0 = tex2D(ushort2_tex, xi-dx, yi-dy); ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy);
ushort2 c1 = tex2D(ushort2_tex, xi+dx, yi-dy); ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy);
ushort2 c2 = tex2D(ushort2_tex, xi-dx, yi+dy); ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy);
ushort2 c3 = tex2D(ushort2_tex, xi+dx, yi+dy); ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy);
int2 uv; int2 uv;
uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
...@@ -176,7 +174,8 @@ __global__ void Subsample_Bilinear_ushort2(ushort2 *dst, ...@@ -176,7 +174,8 @@ __global__ void Subsample_Bilinear_ushort2(ushort2 *dst,
} }
} }
__global__ void Subsample_Bilinear_ushort4(ushort4 *dst, __global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
ushort4 *dst,
int dst_width, int dst_height, int dst_pitch, int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height) int src_width, int src_height)
{ {
...@@ -195,10 +194,10 @@ __global__ void Subsample_Bilinear_ushort4(ushort4 *dst, ...@@ -195,10 +194,10 @@ __global__ void Subsample_Bilinear_ushort4(ushort4 *dst,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh); float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv); float dy = wv / (0.5f + wv);
ushort4 c0 = tex2D(ushort4_tex, xi-dx, yi-dy); ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy);
ushort4 c1 = tex2D(ushort4_tex, xi+dx, yi-dy); ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy);
ushort4 c2 = tex2D(ushort4_tex, xi-dx, yi+dy); ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy);
ushort4 c3 = tex2D(ushort4_tex, xi+dx, yi+dy); ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy);
int4 res; int4 res;
res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment