x86/vf_fspp: port inline asm to yasm

Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com>

x86/vf_fspp: port inline asm to yasm
Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com>
466e32bf · James Almer · 9224c7f0 · 466e32bf · 466e32bf · 466e32bf
Commit 466e32bf authored Dec 26, 2014 by James Almer
6 changed files
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -151,11 +151,11 @@ static void store_slice2_c(uint8_t *dst, int16_t *src,
    }
 }

-static void mul_thrmat_c(FSPPContext *p, int q)
+static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 {
    int a;
    for (a = 0; a < 64; a++)
-        ((int16_t *)p->threshold_mtx)[a] = q * ((int16_t *)p->threshold_mtx_noq)[a];//ints faster in C
+        thr_adr[a] = q * thr_adr_noq[a];
 }

 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -220,7 +220,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                    t = qp_store[qy + (t >> qpsh)];
                    t = norm_qscale(t, p->qscale_type);

-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat(p, t);
+                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
@@ -378,7 +378,7 @@ static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int
    }
 }

-static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
+static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
 {
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -440,7 +440,7 @@ static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_strid
    }
 }

-static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
 {
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -582,7 +582,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
    }

    if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat(fspp, fspp->qp);
+        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);

    /* if we are not in a constant user quantizer mode and we don't want to use
     * the quantizers from the B-frames (B-frames often have a higher QP), we

--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fspp.h
@@ -79,16 +79,16 @@ typedef struct FSPPContext {
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);

-    void (*mul_thrmat)(struct FSPPContext *fspp, int q);
+    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);

    void (*column_fidct)(int16_t *thr_adr, int16_t *data,
                         int16_t *output, int cnt);

    void (*row_idct)(int16_t *workspace, int16_t *output_adr,
-                     int output_stride, int cnt);
+                     ptrdiff_t output_stride, int cnt);

    void (*row_fdct)(int16_t *data, const uint8_t *pixels,
-                     int line_size, int cnt);
+                     ptrdiff_t line_size, int cnt);

 } FSPPContext;


--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
-OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp.o
+OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o

+YASM-OBJS-$(CONFIG_FSPP_FILTER)              += x86/vf_fspp.o
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o

--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
--- a/libavfilter/x86/vf_fspp.c
+++ b/libavfilter/x86/vf_fspp.c
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_fspp.h"
+
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+
+av_cold void ff_fspp_init_x86(FSPPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        s->store_slice  = ff_store_slice_mmx;
+        s->store_slice2 = ff_store_slice2_mmx;
+        s->mul_thrmat   = ff_mul_thrmat_mmx;
+        s->column_fidct = ff_column_fidct_mmx;
+        s->row_idct     = ff_row_idct_mmx;
+        s->row_fdct     = ff_row_fdct_mmx;
+    }
+}