Commit 451be676 authored by Michael Niedermayer

Merge remote-tracking branch 'rbultje/vp9-bugfixes'

* rbultje/vp9-bugfixes:
  vp9: match another find_ref_mvs() bug in libvpx.
  vp9: fix scaled motion vector clipping for sub8x8 blocks.
  vp9: improve signbias check.
  vp9: don't allow compound references if error_resilience is enabled.
  vp9: clamp segmented lflvl before applying ref/mode deltas.
  vp9: reset loopfilter mode/ref deltas on keyframe.
  vp9: fix crash when playing back 440/440 content with width%64<56.
  vp9: extend loopfilter workaround for vp9 h/v mix-up to work for 422.
  vp9: clip motion vectors in the same way as libvpx does.
  vp9: set skip flag if the block had no coded coefficients.
  vp9: apply mv scaling workaround only when subsampling is enabled.
  vp9: read all 4x4 blocks in sub8x8 blocks individually with scalability.
  vp9: fix segmentation map referencing upon framesize change.
  vp9: disable more pmulhrsw optimizations in idct16/32.
  vp9: disable all pmulhrsw in 8/16 iadst x86 optimizations.
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 66f4b1e9 900e3af8
...@@ -153,6 +153,7 @@ typedef struct VP9Context { ...@@ -153,6 +153,7 @@ typedef struct VP9Context {
uint8_t temporal; uint8_t temporal;
uint8_t absolute_vals; uint8_t absolute_vals;
uint8_t update_map; uint8_t update_map;
uint8_t ignore_refmap;
struct { struct {
uint8_t q_enabled; uint8_t q_enabled;
uint8_t lf_enabled; uint8_t lf_enabled;
...@@ -613,11 +614,11 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -613,11 +614,11 @@ static int decode_frame_header(AVCodecContext *ctx,
} else { } else {
s->refreshrefmask = get_bits(&s->gb, 8); s->refreshrefmask = get_bits(&s->gb, 8);
s->refidx[0] = get_bits(&s->gb, 3); s->refidx[0] = get_bits(&s->gb, 3);
s->signbias[0] = get_bits1(&s->gb); s->signbias[0] = get_bits1(&s->gb) && !s->errorres;
s->refidx[1] = get_bits(&s->gb, 3); s->refidx[1] = get_bits(&s->gb, 3);
s->signbias[1] = get_bits1(&s->gb); s->signbias[1] = get_bits1(&s->gb) && !s->errorres;
s->refidx[2] = get_bits(&s->gb, 3); s->refidx[2] = get_bits(&s->gb, 3);
s->signbias[2] = get_bits1(&s->gb); s->signbias[2] = get_bits1(&s->gb) && !s->errorres;
if (!s->refs[s->refidx[0]].f->data[0] || if (!s->refs[s->refidx[0]].f->data[0] ||
!s->refs[s->refidx[1]].f->data[0] || !s->refs[s->refidx[1]].f->data[0] ||
!s->refs[s->refidx[2]].f->data[0]) { !s->refs[s->refidx[2]].f->data[0]) {
...@@ -647,8 +648,8 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -647,8 +648,8 @@ static int decode_frame_header(AVCodecContext *ctx,
s->highprecisionmvs = get_bits1(&s->gb); s->highprecisionmvs = get_bits1(&s->gb);
s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE : s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
get_bits(&s->gb, 2); get_bits(&s->gb, 2);
s->allowcompinter = s->signbias[0] != s->signbias[1] || s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
s->signbias[0] != s->signbias[2]; s->signbias[0] != s->signbias[2]);
if (s->allowcompinter) { if (s->allowcompinter) {
if (s->signbias[0] == s->signbias[1]) { if (s->signbias[0] == s->signbias[1]) {
s->fixcompref = 2; s->fixcompref = 2;
...@@ -697,6 +698,15 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -697,6 +698,15 @@ static int decode_frame_header(AVCodecContext *ctx,
s->framectxid = c = get_bits(&s->gb, 2); s->framectxid = c = get_bits(&s->gb, 2);
/* loopfilter header data */ /* loopfilter header data */
if (s->keyframe || s->errorres || s->intraonly) {
// reset loopfilter defaults
s->lf_delta.ref[0] = 1;
s->lf_delta.ref[1] = 0;
s->lf_delta.ref[2] = -1;
s->lf_delta.ref[3] = -1;
s->lf_delta.mode[0] = 0;
s->lf_delta.mode[1] = 0;
}
s->filter.level = get_bits(&s->gb, 6); s->filter.level = get_bits(&s->gb, 6);
sharp = get_bits(&s->gb, 3); sharp = get_bits(&s->gb, 3);
// if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
...@@ -724,6 +734,7 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -724,6 +734,7 @@ static int decode_frame_header(AVCodecContext *ctx,
s->uvdc_qdelta == 0 && s->uvac_qdelta == 0; s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
/* segmentation header info */ /* segmentation header info */
s->segmentation.ignore_refmap = 0;
if ((s->segmentation.enabled = get_bits1(&s->gb))) { if ((s->segmentation.enabled = get_bits1(&s->gb))) {
if ((s->segmentation.update_map = get_bits1(&s->gb))) { if ((s->segmentation.update_map = get_bits1(&s->gb))) {
for (i = 0; i < 7; i++) for (i = 0; i < 7; i++)
...@@ -738,10 +749,11 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -738,10 +749,11 @@ static int decode_frame_header(AVCodecContext *ctx,
if ((!s->segmentation.update_map || s->segmentation.temporal) && if ((!s->segmentation.update_map || s->segmentation.temporal) &&
(w != s->frames[CUR_FRAME].tf.f->width || (w != s->frames[CUR_FRAME].tf.f->width ||
h != s->frames[CUR_FRAME].tf.f->height)) { h != s->frames[CUR_FRAME].tf.f->height)) {
av_log(ctx, AV_LOG_ERROR, av_log(ctx, AV_LOG_WARNING,
"Reference segmap (temp=%d,update=%d) enabled on size-change!\n", "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
s->segmentation.temporal, s->segmentation.update_map); s->segmentation.temporal, s->segmentation.update_map);
return AVERROR_INVALIDDATA; s->segmentation.ignore_refmap = 1;
//return AVERROR_INVALIDDATA;
} }
if (get_bits1(&s->gb)) { if (get_bits1(&s->gb)) {
...@@ -788,9 +800,9 @@ static int decode_frame_header(AVCodecContext *ctx, ...@@ -788,9 +800,9 @@ static int decode_frame_header(AVCodecContext *ctx,
sh = s->filter.level >= 32; sh = s->filter.level >= 32;
if (s->segmentation.feat[i].lf_enabled) { if (s->segmentation.feat[i].lf_enabled) {
if (s->segmentation.absolute_vals) if (s->segmentation.absolute_vals)
lflvl = s->segmentation.feat[i].lf_val; lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
else else
lflvl = s->filter.level + s->segmentation.feat[i].lf_val; lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
} else { } else {
lflvl = s->filter.level; lflvl = s->filter.level;
} }
...@@ -1100,7 +1112,7 @@ static void find_ref_mvs(VP9Context *s, ...@@ -1100,7 +1112,7 @@ static void find_ref_mvs(VP9Context *s,
int row = s->row, col = s->col, row7 = s->row7; int row = s->row, col = s->col, row7 = s->row7;
const int8_t (*p)[2] = mv_ref_blk_off[b->bs]; const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
#define INVALID_MV 0x80008000U #define INVALID_MV 0x80008000U
uint32_t mem = INVALID_MV; uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
int i; int i;
#define RETURN_DIRECT_MV(mv) \ #define RETURN_DIRECT_MV(mv) \
...@@ -1131,15 +1143,25 @@ static void find_ref_mvs(VP9Context *s, ...@@ -1131,15 +1143,25 @@ static void find_ref_mvs(VP9Context *s,
if (sb > 0) { \ if (sb > 0) { \
VP56mv tmp; \ VP56mv tmp; \
uint32_t m; \ uint32_t m; \
clamp_mv(&tmp, &mv, s); \ av_assert2(idx == 1); \
m = AV_RN32A(&tmp); \ av_assert2(mem != INVALID_MV); \
if (!idx) { \ if (mem_sub8x8 == INVALID_MV) { \
AV_WN32A(pmv, m); \ clamp_mv(&tmp, &mv, s); \
return; \ m = AV_RN32A(&tmp); \
} else if (mem == INVALID_MV) { \ if (m != mem) { \
mem = m; \ AV_WN32A(pmv, m); \
} else if (m != mem) { \ return; \
AV_WN32A(pmv, m); \ } \
mem_sub8x8 = AV_RN32A(&mv); \
} else if (mem_sub8x8 != AV_RN32A(&mv)) { \
clamp_mv(&tmp, &mv, s); \
m = AV_RN32A(&tmp); \
if (m != mem) { \
AV_WN32A(pmv, m); \
} else { \
/* BUG I'm pretty sure this isn't the intention */ \
AV_WN32A(pmv, 0); \
} \
return; \ return; \
} \ } \
} else { \ } else { \
...@@ -1458,7 +1480,7 @@ static void decode_mode(AVCodecContext *ctx) ...@@ -1458,7 +1480,7 @@ static void decode_mode(AVCodecContext *ctx)
vp56_rac_get_prob_branchy(&s->c, vp56_rac_get_prob_branchy(&s->c,
s->prob.segpred[s->above_segpred_ctx[col] + s->prob.segpred[s->above_segpred_ctx[col] +
s->left_segpred_ctx[row7]]))) { s->left_segpred_ctx[row7]]))) {
if (!s->errorres) { if (!s->errorres && !s->segmentation.ignore_refmap) {
int pred = 8, x; int pred = 8, x;
uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map; uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
...@@ -2296,7 +2318,7 @@ static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs, ...@@ -2296,7 +2318,7 @@ static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
nnz, scan, nb, band_counts, qmul); nnz, scan, nb, band_counts, qmul);
} }
static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel) static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
{ {
VP9Context *s = ctx->priv_data; VP9Context *s = ctx->priv_data;
VP9Block *b = s->b; VP9Block *b = s->b;
...@@ -2325,6 +2347,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi ...@@ -2325,6 +2347,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
const int16_t *y_band_counts = band_counts[b->tx]; const int16_t *y_band_counts = band_counts[b->tx];
const int16_t *uv_band_counts = band_counts[b->uvtx]; const int16_t *uv_band_counts = band_counts[b->uvtx];
int bytesperpixel = is8bitsperpixel ? 1 : 2; int bytesperpixel = is8bitsperpixel ? 1 : 2;
int total_coeff = 0;
#define MERGE(la, end, step, rd) \ #define MERGE(la, end, step, rd) \
for (n = 0; n < end; n += step) \ for (n = 0; n < end; n += step) \
...@@ -2344,6 +2367,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi ...@@ -2344,6 +2367,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
c, e, p, a[x] + l[y], yscans[txtp], \ c, e, p, a[x] + l[y], yscans[txtp], \
ynbs[txtp], y_band_counts, qmul[0]); \ ynbs[txtp], y_band_counts, qmul[0]); \
a[x] = l[y] = !!res; \ a[x] = l[y] = !!res; \
total_coeff |= !!res; \
if (step >= 4) { \ if (step >= 4) { \
AV_WN16A(&s->eob[n], res); \ AV_WN16A(&s->eob[n], res); \
} else { \ } else { \
...@@ -2417,6 +2441,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi ...@@ -2417,6 +2441,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
16 * step * step, c, e, p, a[x] + l[y], \ 16 * step * step, c, e, p, a[x] + l[y], \
uvscan, uvnb, uv_band_counts, qmul[1]); \ uvscan, uvnb, uv_band_counts, qmul[1]); \
a[x] = l[y] = !!res; \ a[x] = l[y] = !!res; \
total_coeff |= !!res; \
if (step >= 4) { \ if (step >= 4) { \
AV_WN16A(&s->uveob[pl][n], res); \ AV_WN16A(&s->uveob[pl][n], res); \
} else { \ } else { \
...@@ -2456,16 +2481,18 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi ...@@ -2456,16 +2481,18 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
break; break;
} }
} }
return total_coeff;
} }
static void decode_coeffs_8bpp(AVCodecContext *ctx) static int decode_coeffs_8bpp(AVCodecContext *ctx)
{ {
decode_coeffs(ctx, 1); return decode_coeffs(ctx, 1);
} }
static void decode_coeffs_16bpp(AVCodecContext *ctx) static int decode_coeffs_16bpp(AVCodecContext *ctx)
{ {
decode_coeffs(ctx, 0); return decode_coeffs(ctx, 0);
} }
static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a, static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
...@@ -2733,18 +2760,24 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm ...@@ -2733,18 +2760,24 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *ref, ptrdiff_t ref_stride, const uint8_t *ref, ptrdiff_t ref_stride,
ThreadFrame *ref_frame, ThreadFrame *ref_frame,
ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
int px, int py, int pw, int ph,
int bw, int bh, int w, int h, int bytesperpixel, int bw, int bh, int w, int h, int bytesperpixel,
const uint16_t *scale, const uint8_t *step) const uint16_t *scale, const uint8_t *step)
{ {
#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14) #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
int mx, my;
int refbw_m1, refbh_m1;
int th;
VP56mv mv;
mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
// BUG libvpx seems to scale the two components separately. This introduces // BUG libvpx seems to scale the two components separately. This introduces
// rounding errors but we have to reproduce them to be exactly compatible // rounding errors but we have to reproduce them to be exactly compatible
// with the output from libvpx... // with the output from libvpx...
int mx = scale_mv(mv->x * 2, 0) + scale_mv(x * 16, 0); mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
int my = scale_mv(mv->y * 2, 1) + scale_mv(y * 16, 1); my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
int refbw_m1, refbh_m1;
int th;
y = my >> 4; y = my >> 4;
x = mx >> 4; x = mx >> 4;
...@@ -2776,17 +2809,33 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func ...@@ -2776,17 +2809,33 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
const uint8_t *ref_u, ptrdiff_t src_stride_u, const uint8_t *ref_u, ptrdiff_t src_stride_u,
const uint8_t *ref_v, ptrdiff_t src_stride_v, const uint8_t *ref_v, ptrdiff_t src_stride_v,
ThreadFrame *ref_frame, ThreadFrame *ref_frame,
ptrdiff_t y, ptrdiff_t x, const VP56mv *mv, ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
int px, int py, int pw, int ph,
int bw, int bh, int w, int h, int bytesperpixel, int bw, int bh, int w, int h, int bytesperpixel,
const uint16_t *scale, const uint8_t *step) const uint16_t *scale, const uint8_t *step)
{ {
// BUG https://code.google.com/p/webm/issues/detail?id=820 int mx, my;
int mx = scale_mv(mv->x << !s->ss_h, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
int my = scale_mv(mv->y << !s->ss_v, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
#undef scale_mv
int refbw_m1, refbh_m1; int refbw_m1, refbh_m1;
int th; int th;
VP56mv mv;
if (s->ss_h) {
// BUG https://code.google.com/p/webm/issues/detail?id=820
mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
} else {
mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
}
if (s->ss_v) {
// BUG https://code.google.com/p/webm/issues/detail?id=820
mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
} else {
mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
}
#undef scale_mv
y = my >> 4; y = my >> 4;
x = mx >> 4; x = mx >> 4;
ref_u += y * src_stride_u + x * bytesperpixel; ref_u += y * src_stride_u + x * bytesperpixel;
...@@ -2822,15 +2871,17 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func ...@@ -2822,15 +2871,17 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
} }
} }
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \ #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
px, py, pw, ph, bw, bh, w, h, i) \
mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \ mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
mv, bw, bh, w, h, bytesperpixel, \ mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
row, col, mv, bw, bh, w, h, i) \ row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
row, col, mv, bw, bh, w, h, bytesperpixel, \ row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define SCALED 1
#define FN(x) x##_scaled_8bpp #define FN(x) x##_scaled_8bpp
#define BYTES_PER_PIXEL 1 #define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c" #include "vp9_mc_template.c"
...@@ -2843,6 +2894,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func ...@@ -2843,6 +2894,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
#undef mc_chroma_dir #undef mc_chroma_dir
#undef FN #undef FN
#undef BYTES_PER_PIXEL #undef BYTES_PER_PIXEL
#undef SCALED
static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2], static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
uint8_t *dst, ptrdiff_t dst_stride, uint8_t *dst, ptrdiff_t dst_stride,
...@@ -2921,13 +2973,15 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc) ...@@ -2921,13 +2973,15 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
} }
} }
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \ #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
px, py, pw, ph, bw, bh, w, h, i) \
mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \ mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
mv, bw, bh, w, h, bytesperpixel) mv, bw, bh, w, h, bytesperpixel)
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
row, col, mv, bw, bh, w, h, i) \ row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
row, col, mv, bw, bh, w, h, bytesperpixel) row, col, mv, bw, bh, w, h, bytesperpixel)
#define SCALED 0
#define FN(x) x##_8bpp #define FN(x) x##_8bpp
#define BYTES_PER_PIXEL 1 #define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c" #include "vp9_mc_template.c"
...@@ -2940,6 +2994,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc) ...@@ -2940,6 +2994,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
#undef mc_chroma_dir_dir #undef mc_chroma_dir_dir
#undef FN #undef FN
#undef BYTES_PER_PIXEL #undef BYTES_PER_PIXEL
#undef SCALED
static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel) static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
{ {
...@@ -3077,8 +3132,12 @@ static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_ ...@@ -3077,8 +3132,12 @@ static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_
} }
if (!ss_h) if (!ss_h)
mask[0][y][3] |= m_col; mask[0][y][3] |= m_col;
if (!ss_v) if (!ss_v) {
mask[1][y][3] |= m_col; if (ss_h && (col_end & 1))
mask[1][y][3] |= (t << (w - 1)) - t;
else
mask[1][y][3] |= m_col;
}
} }
} else { } else {
int y, t = 1 << col_and_7, m_col = (t << w) - t; int y, t = 1 << col_and_7, m_col = (t << w) - t;
...@@ -3164,10 +3223,17 @@ static void decode_b(AVCodecContext *ctx, int row, int col, ...@@ -3164,10 +3223,17 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
(s->ss_v && h4 * 2 == (1 << b->tx))); (s->ss_v && h4 * 2 == (1 << b->tx)));
if (!b->skip) { if (!b->skip) {
int has_coeffs;
if (bytesperpixel == 1) { if (bytesperpixel == 1) {
decode_coeffs_8bpp(ctx); has_coeffs = decode_coeffs_8bpp(ctx);
} else { } else {
decode_coeffs_16bpp(ctx); has_coeffs = decode_coeffs_16bpp(ctx);
}
if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
b->skip = 1;
memset(&s->above_skip_ctx[col], 1, w4);
memset(&s->left_skip_ctx[s->row7], 1, h4);
} }
} else { } else {
int row7 = s->row7; int row7 = s->row7;
...@@ -3272,7 +3338,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col, ...@@ -3272,7 +3338,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h; int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0; int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
for (n = 1; o < w; n++) { for (n = s->ss_h; o < w; n++) {
int bw = 64 >> n; int bw = 64 >> n;
av_assert2(n <= 4); av_assert2(n <= 4);
......
...@@ -53,14 +53,15 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -53,14 +53,15 @@ static void FN(inter_pred)(AVCodecContext *ctx)
if (b->bs > BS_8x8) { if (b->bs > BS_8x8) {
VP56mv uvmv; VP56mv uvmv;
#if SCALED == 0
if (b->bs == BS_8x4) { if (b->bs == BS_8x4) {
mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y, mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1, 0); row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
mc_luma_dir(s, mc[3][b->filter][0], mc_luma_dir(s, mc[3][b->filter][0],
s->dst[0] + 4 * ls_y, ls_y, s->dst[0] + 4 * ls_y, ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
(row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1, 0); (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
w1 = (w1 + s->ss_h) >> s->ss_h; w1 = (w1 + s->ss_h) >> s->ss_h;
if (s->ss_v) { if (s->ss_v) {
h1 = (h1 + 1) >> 1; h1 = (h1 + 1) >> 1;
...@@ -70,14 +71,14 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -70,14 +71,14 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 2, col << (3 - s->ss_h), row << 2, col << (3 - s->ss_h),
&uvmv, 8 >> s->ss_h, 4, w1, h1, 0); &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
} else { } else {
mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0], mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 3, col << (3 - s->ss_h), row << 3, col << (3 - s->ss_h),
&b->mv[0][0], 8 >> s->ss_h, 4, w1, h1, 0); &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
// BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
// to get the motion vector for the bottom 4x4 block // to get the motion vector for the bottom 4x4 block
// https://code.google.com/p/webm/issues/detail?id=993 // https://code.google.com/p/webm/issues/detail?id=993
...@@ -91,17 +92,17 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -91,17 +92,17 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
(row << 3) + 4, col << (3 - s->ss_h), (row << 3) + 4, col << (3 - s->ss_h),
&uvmv, 8 >> s->ss_h, 4, w1, h1, 0); &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
} }
if (b->comp) { if (b->comp) {
mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y, mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2, 1); row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
mc_luma_dir(s, mc[3][b->filter][1], mc_luma_dir(s, mc[3][b->filter][1],
s->dst[0] + 4 * ls_y, ls_y, s->dst[0] + 4 * ls_y, ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
(row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2, 1); (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
w2 = (w2 + s->ss_h) >> s->ss_h; w2 = (w2 + s->ss_h) >> s->ss_h;
if (s->ss_v) { if (s->ss_v) {
h2 = (h2 + 1) >> 1; h2 = (h2 + 1) >> 1;
...@@ -111,14 +112,14 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -111,14 +112,14 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 2, col << (3 - s->ss_h), row << 2, col << (3 - s->ss_h),
&uvmv, 8 >> s->ss_h, 4, w2, h2, 1); &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
} else { } else {
mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1], mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 3, col << (3 - s->ss_h), row << 3, col << (3 - s->ss_h),
&b->mv[0][1], 8 >> s->ss_h, 4, w2, h2, 1); &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
// BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
// to get the motion vector for the bottom 4x4 block // to get the motion vector for the bottom 4x4 block
// https://code.google.com/p/webm/issues/detail?id=993 // https://code.google.com/p/webm/issues/detail?id=993
...@@ -132,16 +133,16 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -132,16 +133,16 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
(row << 3) + 4, col << (3 - s->ss_h), (row << 3) + 4, col << (3 - s->ss_h),
&uvmv, 8 >> s->ss_h, 4, w2, h2, 1); &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
} }
} }
} else if (b->bs == BS_4x8) { } else if (b->bs == BS_4x8) {
mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y, mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1, 0); row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y, mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1, 0); row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
h1 = (h1 + s->ss_v) >> s->ss_v; h1 = (h1 + s->ss_v) >> s->ss_v;
if (s->ss_h) { if (s->ss_h) {
w1 = (w1 + 1) >> 1; w1 = (w1 + 1) >> 1;
...@@ -151,30 +152,30 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -151,30 +152,30 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << (3 - s->ss_v), col << 2, row << (3 - s->ss_v), col << 2,
&uvmv, 4, 8 >> s->ss_v, w1, h1, 0); &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
} else { } else {
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << (3 - s->ss_v), col << 3, row << (3 - s->ss_v), col << 3,
&b->mv[0][0], 4, 8 >> s->ss_v, w1, h1, 0); &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
s->dst[2] + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * bytesperpixel, ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << (3 - s->ss_v), (col << 3) + 4, row << (3 - s->ss_v), (col << 3) + 4,
&b->mv[1][0], 4, 8 >> s->ss_v, w1, h1, 0); &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
} }
if (b->comp) { if (b->comp) {
mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y, mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2, 1); row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y, mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2, 1); row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
h2 = (h2 + s->ss_v) >> s->ss_v; h2 = (h2 + s->ss_v) >> s->ss_v;
if (s->ss_h) { if (s->ss_h) {
w2 = (w2 + 1) >> 1; w2 = (w2 + 1) >> 1;
...@@ -184,42 +185,48 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -184,42 +185,48 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << (3 - s->ss_v), col << 2, row << (3 - s->ss_v), col << 2,
&uvmv, 4, 8 >> s->ss_v, w2, h2, 1); &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
} else { } else {
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << (3 - s->ss_v), col << 3, row << (3 - s->ss_v), col << 3,
&b->mv[0][1], 4, 8 >> s->ss_v, w2, h2, 1); &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
s->dst[2] + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * bytesperpixel, ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << (3 - s->ss_v), (col << 3) + 4, row << (3 - s->ss_v), (col << 3) + 4,
&b->mv[1][1], 4, 8 >> s->ss_v, w2, h2, 1); &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
} }
} }
} else { } else
#endif
{
av_assert2(b->bs == BS_4x4); av_assert2(b->bs == BS_4x4);
// FIXME if two horizontally adjacent blocks have the same MV, // FIXME if two horizontally adjacent blocks have the same MV,
// do a w8 instead of a w4 call // do a w8 instead of a w4 call
mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y, mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1, 0); row << 3, col << 3, &b->mv[0][0],
0, 0, 8, 8, 4, 4, w1, h1, 0);
mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y, mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1, 0); row << 3, (col << 3) + 4, &b->mv[1][0],
4, 0, 8, 8, 4, 4, w1, h1, 0);
mc_luma_dir(s, mc[4][b->filter][0], mc_luma_dir(s, mc[4][b->filter][0],
s->dst[0] + 4 * ls_y, ls_y, s->dst[0] + 4 * ls_y, ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
(row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1, 0); (row << 3) + 4, col << 3, &b->mv[2][0],
0, 4, 8, 8, 4, 4, w1, h1, 0);
mc_luma_dir(s, mc[4][b->filter][0], mc_luma_dir(s, mc[4][b->filter][0],
s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y, s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
(row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1, 0); (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
4, 4, 8, 8, 4, 4, w1, h1, 0);
if (s->ss_v) { if (s->ss_v) {
h1 = (h1 + 1) >> 1; h1 = (h1 + 1) >> 1;
if (s->ss_h) { if (s->ss_h) {
...@@ -231,7 +238,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -231,7 +238,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 2, col << 2, row << 2, col << 2,
&uvmv, 4, 4, w1, h1, 0); &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
} else { } else {
uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]); uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
...@@ -239,7 +246,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -239,7 +246,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 2, col << 3, row << 2, col << 3,
&uvmv, 4, 4, w1, h1, 0); &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]); uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
...@@ -247,7 +254,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -247,7 +254,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 2, (col << 3) + 4, row << 2, (col << 3) + 4,
&uvmv, 4, 4, w1, h1, 0); &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
} }
} else { } else {
if (s->ss_h) { if (s->ss_h) {
...@@ -258,7 +265,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -258,7 +265,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 3, col << 2, row << 3, col << 2,
&uvmv, 4, 4, w1, h1, 0); &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
// BUG libvpx uses wrong block index for 4:2:2 bs=4x4 // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
// bottom block // bottom block
// https://code.google.com/p/webm/issues/detail?id=993 // https://code.google.com/p/webm/issues/detail?id=993
...@@ -268,52 +275,52 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -268,52 +275,52 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
(row << 3) + 4, col << 2, (row << 3) + 4, col << 2,
&uvmv, 4, 4, w1, h1, 0); &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
} else { } else {
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 3, col << 3, row << 3, col << 3,
&b->mv[0][0], 4, 4, w1, h1, 0); &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
s->dst[2] + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * bytesperpixel, ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << 3, (col << 3) + 4, row << 3, (col << 3) + 4,
&b->mv[1][0], 4, 4, w1, h1, 0); &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv, s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
(row << 3) + 4, col << 3, (row << 3) + 4, col << 3,
&b->mv[2][0], 4, 4, w1, h1, 0); &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
mc_chroma_dir(s, mc[4][b->filter][0], mc_chroma_dir(s, mc[4][b->filter][0],
s->dst[1] + 4 * ls_uv + 4 * bytesperpixel, s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
(row << 3) + 4, (col << 3) + 4, (row << 3) + 4, (col << 3) + 4,
&b->mv[3][0], 4, 4, w1, h1, 0); &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
} }
} }
if (b->comp) { if (b->comp) {
mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y, mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2, 1); row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y, mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2, 1); row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
mc_luma_dir(s, mc[4][b->filter][1], mc_luma_dir(s, mc[4][b->filter][1],
s->dst[0] + 4 * ls_y, ls_y, s->dst[0] + 4 * ls_y, ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
(row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2, 1); (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
mc_luma_dir(s, mc[4][b->filter][1], mc_luma_dir(s, mc[4][b->filter][1],
s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y, s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
(row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2, 1); (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
if (s->ss_v) { if (s->ss_v) {
h2 = (h2 + 1) >> 1; h2 = (h2 + 1) >> 1;
if (s->ss_h) { if (s->ss_h) {
...@@ -325,7 +332,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -325,7 +332,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 2, col << 2, row << 2, col << 2,
&uvmv, 4, 4, w2, h2, 1); &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
} else { } else {
uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]); uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
...@@ -333,7 +340,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -333,7 +340,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 2, col << 3, row << 2, col << 3,
&uvmv, 4, 4, w2, h2, 1); &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]); uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
...@@ -341,7 +348,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -341,7 +348,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 2, (col << 3) + 4, row << 2, (col << 3) + 4,
&uvmv, 4, 4, w2, h2, 1); &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
} }
} else { } else {
if (s->ss_h) { if (s->ss_h) {
...@@ -352,7 +359,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -352,7 +359,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 3, col << 2, row << 3, col << 2,
&uvmv, 4, 4, w2, h2, 1); &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
// BUG libvpx uses wrong block index for 4:2:2 bs=4x4 // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
// bottom block // bottom block
// https://code.google.com/p/webm/issues/detail?id=993 // https://code.google.com/p/webm/issues/detail?id=993
...@@ -362,34 +369,34 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -362,34 +369,34 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
(row << 3) + 4, col << 2, (row << 3) + 4, col << 2,
&uvmv, 4, 4, w2, h2, 1); &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
} else { } else {
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1], s->dst[2], ls_uv, s->dst[1], s->dst[2], ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 3, col << 3, row << 3, col << 3,
&b->mv[0][1], 4, 4, w2, h2, 1); &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1] + 4 * bytesperpixel, s->dst[1] + 4 * bytesperpixel,
s->dst[2] + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * bytesperpixel, ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << 3, (col << 3) + 4, row << 3, (col << 3) + 4,
&b->mv[1][1], 4, 4, w2, h2, 1); &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv, s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
(row << 3) + 4, col << 3, (row << 3) + 4, col << 3,
&b->mv[2][1], 4, 4, w2, h2, 1); &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
mc_chroma_dir(s, mc[4][b->filter][1], mc_chroma_dir(s, mc[4][b->filter][1],
s->dst[1] + 4 * ls_uv + 4 * bytesperpixel, s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv, s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
(row << 3) + 4, (col << 3) + 4, (row << 3) + 4, (col << 3) + 4,
&b->mv[3][1], 4, 4, w2, h2, 1); &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
} }
} }
} }
...@@ -401,7 +408,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -401,7 +408,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y, mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
ref1->data[0], ref1->linesize[0], tref1, ref1->data[0], ref1->linesize[0], tref1,
row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1, 0); row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
w1 = (w1 + s->ss_h) >> s->ss_h; w1 = (w1 + s->ss_h) >> s->ss_h;
h1 = (h1 + s->ss_v) >> s->ss_v; h1 = (h1 + s->ss_v) >> s->ss_v;
mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0], mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0],
...@@ -409,12 +416,12 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -409,12 +416,12 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref1->data[1], ref1->linesize[1], ref1->data[1], ref1->linesize[1],
ref1->data[2], ref1->linesize[2], tref1, ref1->data[2], ref1->linesize[2], tref1,
row << (3 - s->ss_v), col << (3 - s->ss_h), row << (3 - s->ss_v), col << (3 - s->ss_h),
&b->mv[0][0], uvbw, uvbh, w1, h1, 0); &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
if (b->comp) { if (b->comp) {
mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y, mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
ref2->data[0], ref2->linesize[0], tref2, ref2->data[0], ref2->linesize[0], tref2,
row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1); row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
w2 = (w2 + s->ss_h) >> s->ss_h; w2 = (w2 + s->ss_h) >> s->ss_h;
h2 = (h2 + s->ss_v) >> s->ss_v; h2 = (h2 + s->ss_v) >> s->ss_v;
mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1], mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1],
...@@ -422,7 +429,7 @@ static void FN(inter_pred)(AVCodecContext *ctx) ...@@ -422,7 +429,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
ref2->data[1], ref2->linesize[1], ref2->data[1], ref2->linesize[1],
ref2->data[2], ref2->linesize[2], tref2, ref2->data[2], ref2->linesize[2], tref2,
row << (3 - s->ss_v), col << (3 - s->ss_h), row << (3 - s->ss_v), col << (3 - s->ss_h),
&b->mv[0][1], uvbw, uvbh, w2, h2, 1); &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
} }
} }
} }
...@@ -868,7 +868,8 @@ VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13 ...@@ -868,7 +868,8 @@ VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7 ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
%if cpuflag(ssse3) ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
SUMSUB_BA w, 3, 4, 2 SUMSUB_BA w, 3, 4, 2
SUMSUB_BA w, 0, 7, 2 SUMSUB_BA w, 0, 7, 2
pmulhrsw m3, W_11585x2_REG pmulhrsw m3, W_11585x2_REG
...@@ -996,7 +997,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -996,7 +997,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
; SUMSUB_BA w, x13, x14, 7 ; t6, t9 ; SUMSUB_BA w, x13, x14, 7 ; t6, t9
; SUMSUB_BA w, x15, x12, 7 ; t7, t8 ; SUMSUB_BA w, x15, x12, 7 ; t7, t8
%macro VP9_IDCT16_1D_START 5 ; src, nnzc, stride, scratch, scratch_stride %macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
%if %2 <= 4 %if %2 <= 4
mova m3, [%1+ 1*%3] ; IN(1) mova m3, [%1+ 1*%3] ; IN(1)
mova m0, [%1+ 3*%3] ; IN(3) mova m0, [%1+ 3*%3] ; IN(3)
...@@ -1089,7 +1090,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1089,7 +1090,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
%if cpuflag(ssse3) %if cpuflag(ssse3) && %6 == 0
SUMSUB_BA w, 2, 5, 7 SUMSUB_BA w, 2, 5, 7
SUMSUB_BA w, 3, 4, 7 SUMSUB_BA w, 3, 4, 7
pmulhrsw m5, [pw_11585x2] ; t10 pmulhrsw m5, [pw_11585x2] ; t10
...@@ -1163,7 +1164,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1163,7 +1164,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
SUMSUB_BA w, 4, 6, 2 ; t4, t5 SUMSUB_BA w, 4, 6, 2 ; t4, t5
SUMSUB_BA w, 7, 5, 2 ; t7, t6 SUMSUB_BA w, 7, 5, 2 ; t7, t6
%if cpuflag(ssse3) %if cpuflag(ssse3) && %6 == 0
SUMSUB_BA w, 6, 5, 2 SUMSUB_BA w, 6, 5, 2
pmulhrsw m5, [pw_11585x2] ; t5 pmulhrsw m5, [pw_11585x2] ; t5
pmulhrsw m6, [pw_11585x2] ; t6 pmulhrsw m6, [pw_11585x2] ; t6
...@@ -1183,7 +1184,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1183,7 +1184,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
mova m3, [%1+ 8*%3] ; IN(8) mova m3, [%1+ 8*%3] ; IN(8)
; from 3 stages back ; from 3 stages back
%if cpuflag(ssse3) %if cpuflag(ssse3) && %6 == 0
SUMSUB_BA w, 3, 2, 5 SUMSUB_BA w, 3, 2, 5
pmulhrsw m3, [pw_11585x2] ; t0 pmulhrsw m3, [pw_11585x2] ; t0
pmulhrsw m2, [pw_11585x2] ; t1 pmulhrsw m2, [pw_11585x2] ; t1
...@@ -1248,9 +1249,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1248,9 +1249,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
%endif %endif
%endmacro %endmacro
%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc %macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
%if %2 == 1 %if %2 == 1
VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16 VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
%if ARCH_X86_64 %if ARCH_X86_64
; backup a different register ; backup a different register
...@@ -1317,7 +1318,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1317,7 +1318,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
mova [tmpq+15*16], m7 mova [tmpq+15*16], m7
%endif %endif
%else ; %2 == 2 %else ; %2 == 2
VP9_IDCT16_1D_START %1, %3, 32, %1, 32 VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
%if cpuflag(ssse3) %if cpuflag(ssse3)
%define ROUND_REG [pw_512] %define ROUND_REG [pw_512]
...@@ -1467,12 +1468,12 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob ...@@ -1467,12 +1468,12 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
%if cpuflag(ssse3) %if cpuflag(ssse3)
.idct8x8: .idct8x8:
mov tmpq, rsp mov tmpq, rsp
VP9_IDCT16_1D blockq, 1, 8 VP9_IDCT16_1D blockq, 1, 8, 0
mov cntd, 2 mov cntd, 2
mov dst_bakq, dstq mov dst_bakq, dstq
.loop2_8x8: .loop2_8x8:
VP9_IDCT16_1D tmpq, 2, 8 VP9_IDCT16_1D tmpq, 2, 8, 0
lea dstq, [dst_bakq+8] lea dstq, [dst_bakq+8]
add tmpq, 16 add tmpq, 16
dec cntd dec cntd
...@@ -1488,7 +1489,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob ...@@ -1488,7 +1489,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
mov cntd, 2 mov cntd, 2
mov tmpq, rsp mov tmpq, rsp
.loop1_full: .loop1_full:
VP9_IDCT16_1D blockq, 1 VP9_IDCT16_1D blockq, 1, 16, 0
add blockq, 16 add blockq, 16
add tmpq, 256 add tmpq, 256
dec cntd dec cntd
...@@ -1499,7 +1500,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob ...@@ -1499,7 +1500,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
mov tmpq, rsp mov tmpq, rsp
mov dst_bakq, dstq mov dst_bakq, dstq
.loop2_full: .loop2_full:
VP9_IDCT16_1D tmpq, 2 VP9_IDCT16_1D tmpq, 2, 16, 0
lea dstq, [dst_bakq+8] lea dstq, [dst_bakq+8]
add tmpq, 16 add tmpq, 16
dec cntd dec cntd
...@@ -1647,7 +1648,8 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx ...@@ -1647,7 +1648,8 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192]
PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w]
%if cpuflag(ssse3) ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
SUMSUB_BA w, 7, 6, 1 SUMSUB_BA w, 7, 6, 1
pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] pmulhrsw m7, [pw_m11585x2] ; m7=out5[w]
pmulhrsw m6, [pw_11585x2] ; m6=out10[w] pmulhrsw m6, [pw_11585x2] ; m6=out10[w]
...@@ -1899,7 +1901,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ...@@ -1899,7 +1901,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
%assign %%str 16*%2*%2 %assign %%str 16*%2*%2
; first do t0-15, this can be done identical to idct16x16 ; first do t0-15, this can be done identical to idct16x16
VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
; store everything on stack to make space available for t16-31 ; store everything on stack to make space available for t16-31
; we store interleaved with the output of the second half (t16-31) ; we store interleaved with the output of the second half (t16-31)
...@@ -2130,7 +2132,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx ...@@ -2130,7 +2132,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
%if cpuflag(ssse3) %if 0; cpuflag(ssse3)
%if ARCH_X86_64 %if ARCH_X86_64
SUMSUB_BA w, 4, 7, 8 SUMSUB_BA w, 4, 7, 8
SUMSUB_BA w, 5, 1, 8 SUMSUB_BA w, 5, 1, 8
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment