avcodec/synth_filter: split off remaining code from dcadec files

Signed-off-by: James Almer <jamrial@gmail.com>

avcodec/synth_filter: split off remaining code from dcadec files
Signed-off-by: James Almer <jamrial@gmail.com>
209f50e1 · James Almer · 5dc37a5d · 209f50e1 · 209f50e1 · 209f50e1
Commit 209f50e1 authored Jan 25, 2016 by James Almer
11 changed files
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
-OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o               \
+                                           aarch64/synth_filter_init.o
 OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT)               += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o

--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -24,23 +24,10 @@
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
 #include "libavcodec/dcadsp.h"
-#include "libavcodec/fft.h"
-
-#include "asm-offsets.h"
-
-#if HAVE_NEON || HAVE_VFP
-AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
-#endif

 void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);

-void ff_synth_filter_float_neon(FFTContext *imdct,
-                                float *synth_buf_ptr, int *synth_buf_offset,
-                                float synth_buf2[32], const float window[512],
-                                float out[32], const float in[32],
-                                float scale);
-
 av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -50,11 +37,3 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
    }
 }
-
-av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags))
-        s->synth_filter_float = ff_synth_filter_float_neon;
-}
--- a/libavcodec/aarch64/synth_filter_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
+
+#include "asm-offsets.h"
+
+#if HAVE_NEON || HAVE_VFP
+AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
+#endif
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale);
+
+av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -36,7 +36,8 @@ OBJS-$(CONFIG_VP8DSP)                  += arm/vp8dsp_init_arm.o
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                          arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
+                                          arm/synth_filter_init_arm.o
 OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o

--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -37,18 +37,6 @@ void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                const float window[512], float *samples_out,
                                float raXin[32], float scale);

-void ff_synth_filter_float_vfp(FFTContext *imdct,
-                               float *synth_buf_ptr, int *synth_buf_offset,
-                               float synth_buf2[32], const float window[512],
-                               float out[32], const float in[32],
-                               float scale);
-
-void ff_synth_filter_float_neon(FFTContext *imdct,
-                                float *synth_buf_ptr, int *synth_buf_offset,
-                                float synth_buf2[32], const float window[512],
-                                float out[32], const float in[32],
-                                float scale);
-
 av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -63,13 +51,3 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
    }
 }
-
-av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_vfp_vm(cpu_flags))
-        s->synth_filter_float = ff_synth_filter_float_vfp;
-    if (have_neon(cpu_flags))
-        s->synth_filter_float = ff_synth_filter_float_neon;
-}
--- a/libavcodec/arm/synth_filter_init_arm.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
+
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+                               float *synth_buf_ptr, int *synth_buf_offset,
+                               float synth_buf2[32], const float window[512],
+                               float out[32], const float in[32],
+                               float scale);
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale);
+
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_vfp_vm(cpu_flags))
+        s->synth_filter_float = ff_synth_filter_float_vfp;
+    if (have_neon(cpu_flags))
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -44,7 +44,8 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
 OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o            \
+                                          x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
@@ -132,7 +133,8 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
 YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 YASM-OBJS-$(CONFIG_ALAC_DECODER)       += x86/alacdsp.o
 YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
+YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o                  \
+                                          x86/synth_filter.o
 YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o \
                                          x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o

--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -121,225 +121,3 @@ DCA_LFE_FIR 1
 INIT_XMM fma3
 DCA_LFE_FIR 0
 %endif
-
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
-    pxor          %1, %1
-%else
-    xorps         %1, %1, %1
-%endif
-%endmacro
-
-%macro SHUF 3
-%if cpuflag(avx)
-    mova          %3, [%2 - 16]
-    vperm2f128    %1, %3, %3, 1
-    vshufps       %1, %1, %1, q0123
-%elif cpuflag(sse2)
-    pshufd        %1, [%2], q0123
-%else
-    mova          %1, [%2]
-    shufps        %1, %1, q0123
-%endif
-%endmacro
-
-%macro INNER_LOOP   1
-    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
-    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
-    ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
-    mova          m6, [ptr1 + j]
-%if ARCH_X86_64
-    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
-    mova         m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
-    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
-    fnmaddps      m1, m5,  [win + %1 + j], m1
-%if ARCH_X86_64
-    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
-    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
-    mulps         m6, m6,  [win + %1 + j + 16 * 4]
-    mulps         m5, m5,  [win + %1 + j]
-%if ARCH_X86_64
-    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
-    mulps        m11, m11, [win + %1 + j + mmsize]
-%endif
-    addps         m2, m2, m6
-    subps         m1, m1, m5
-%if ARCH_X86_64
-    addps         m8, m8, m12
-    subps         m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
-    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
-    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
-    mova          m5, [ptr1 + j + 16 * 4]
-%if ARCH_X86_64
-    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
-    mova         m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
-%if cpuflag(fma3)
-    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
-    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
-    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
-    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
-    mulps         m5, m5,  [win + %1 + j + 32 * 4]
-    mulps         m6, m6,  [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
-    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
-    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
-    addps         m3, m3, m5
-    addps         m4, m4, m6
-%if ARCH_X86_64
-    addps         m9, m9, m11
-    addps        m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
-    sub            j, 64 * 4
-%endmacro
-
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-;                                  const float window[512], float out[32],
-;                                  intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
-                              synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
-    movd       scale, scalem
-    SPLATD        m0
-%else
-    VBROADCASTSS  m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ  r4q
-%else
-    SPLATD      xmm0
-%if cpuflag(avx)
-    vinsertf128   m0, m0, xmm0, 1
-%endif
-%define OFFQ  offq
-%endif
-    ; prepare inner counter limit 1
-    mov          r5q, 480
-    sub          r5q, offmp
-    and          r5q, -64
-    shl          r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
-    mov         OFFQ, r5q
-%define i        r5q
-    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
-%else
-%define i 0
-%define OFFQ  r5q
-%endif
-
-%define buf2     synth_buf2q
-%if ARCH_X86_32
-    mov         buf2, synth_buf2mp
-%endif
-.mainloop:
-    ; m1 = a  m2 = b  m3 = c  m4 = d
-    SETZERO       m3
-    SETZERO       m4
-    mova          m1, [buf2 + i]
-    mova          m2, [buf2 + i + 16 * 4]
-%if ARCH_X86_32
-%define ptr1     r0q
-%define ptr2     r1q
-%define win      r2q
-%define j        r3q
-    mov          win, windowm
-    mov         ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
-    add          win, i
-    add         ptr1, i
-%endif
-%else ; ARCH_X86_64
-%define ptr1     r6q
-%define ptr2     r7q ; must be loaded
-%define win      r8q
-%define j        r9q
-    SETZERO       m9
-    SETZERO      m10
-    mova          m7, [buf2 + i + mmsize]
-    mova          m8, [buf2 + i + mmsize + 16 * 4]
-    lea          win, [windowq + i]
-    lea         ptr1, [synth_bufq + i]
-%endif
-    mov         ptr2, synth_bufmp
-    ; prepare the inner loop counter
-    mov            j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub         ptr2, i
-%endif
-.loop1:
-    INNER_LOOP  0
-    jge       .loop1
-
-    mov            j, 448 * 4
-    sub            j, OFFQ
-    jz          .end
-    sub         ptr1, j
-    sub         ptr2, j
-    add          win, OFFQ ; now at j-64, so define OFFSET
-    sub            j, 64 * 4
-.loop2:
-    INNER_LOOP  64 * 4
-    jge       .loop2
-
-.end:
-%if ARCH_X86_32
-    mov         buf2, synth_buf2m ; needed for next iteration anyway
-    mov         outq, outmp       ; j, which will be set again during it
-%endif
-    ;~ out[i]      = a * scale;
-    ;~ out[i + 16] = b * scale;
-    mulps         m1, m1, scale
-    mulps         m2, m2, scale
-%if ARCH_X86_64
-    mulps         m7, m7, scale
-    mulps         m8, m8, scale
-%endif
-    ;~ synth_buf2[i]      = c;
-    ;~ synth_buf2[i + 16] = d;
-    mova   [buf2 + i +  0 * 4], m3
-    mova   [buf2 + i + 16 * 4], m4
-%if ARCH_X86_64
-    mova   [buf2 + i +  0 * 4 + mmsize], m9
-    mova   [buf2 + i + 16 * 4 + mmsize], m10
-%endif
-    ;~ out[i]      = a;
-    ;~ out[i + 16] = a;
-    mova   [outq + i +  0 * 4], m1
-    mova   [outq + i + 16 * 4], m2
-%if ARCH_X86_64
-    mova   [outq + i +  0 * 4 + mmsize], m7
-    mova   [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub            i, (ARCH_X86_64 + 1) * mmsize
-    jge    .mainloop
-%endif
-    RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
-%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -40,54 +40,3 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
        s->lfe_fir[0]        = ff_dca_lfe_fir0_fma3;
    }
 }
-
-
-#define SYNTH_FILTER_FUNC(opt)                                                 \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
-                                 const float window[512],                      \
-                                 float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct,                              \
-                               float *synth_buf_ptr, int *synth_buf_offset,    \
-                               float synth_buf2[32], const float window[512],  \
-                               float out[32], const float in[32], float scale) \
-{                                                                              \
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
-                                                                               \
-    imdct->imdct_half(imdct, synth_buf, in);                                   \
-                                                                               \
-    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
-                                out, *synth_buf_offset, scale);                \
-                                                                               \
-    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
-}                                                                              \
-
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
-{
-#if HAVE_YASM
-    int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse;
-    }
-#endif
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse2;
-    }
-    if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        s->synth_filter_float = synth_filter_avx;
-    }
-    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
-        s->synth_filter_float = synth_filter_fma3;
-    }
-#endif /* HAVE_YASM */
-}
--- a/libavcodec/x86/synth_filter.asm
+++ b/libavcodec/x86/synth_filter.asm
+;******************************************************************************
+;* SSE-optimized functions for the DCA decoder
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+    pxor          %1, %1
+%else
+    xorps         %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+    mova          %3, [%2 - 16]
+    vperm2f128    %1, %3, %3, 1
+    vshufps       %1, %1, %1, q0123
+%elif cpuflag(sse2)
+    pshufd        %1, [%2], q0123
+%else
+    mova          %1, [%2]
+    shufps        %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP   1
+    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
+    ;~ b += window[i + j + 16] * (synth_buf[i + j])
+    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
+    mova          m6, [ptr1 + j]
+%if ARCH_X86_64
+    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
+    mova         m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
+    fnmaddps      m1, m5,  [win + %1 + j], m1
+%if ARCH_X86_64
+    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+    mulps         m6, m6,  [win + %1 + j + 16 * 4]
+    mulps         m5, m5,  [win + %1 + j]
+%if ARCH_X86_64
+    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
+    mulps        m11, m11, [win + %1 + j + mmsize]
+%endif
+    addps         m2, m2, m6
+    subps         m1, m1, m5
+%if ARCH_X86_64
+    addps         m8, m8, m12
+    subps         m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
+    mova          m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
+    mova         m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
+    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+    mulps         m5, m5,  [win + %1 + j + 32 * 4]
+    mulps         m6, m6,  [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
+    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+    addps         m3, m3, m5
+    addps         m4, m4, m6
+%if ARCH_X86_64
+    addps         m9, m9, m11
+    addps        m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+    sub            j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+;                                  const float window[512], float out[32],
+;                                  intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+                              synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+    movd       scale, scalem
+    SPLATD        m0
+%else
+    VBROADCASTSS  m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ  r4q
+%else
+    SPLATD      xmm0
+%if cpuflag(avx)
+    vinsertf128   m0, m0, xmm0, 1
+%endif
+%define OFFQ  offq
+%endif
+    ; prepare inner counter limit 1
+    mov          r5q, 480
+    sub          r5q, offmp
+    and          r5q, -64
+    shl          r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+    mov         OFFQ, r5q
+%define i        r5q
+    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
+%else
+%define i 0
+%define OFFQ  r5q
+%endif
+
+%define buf2     synth_buf2q
+%if ARCH_X86_32
+    mov         buf2, synth_buf2mp
+%endif
+.mainloop:
+    ; m1 = a  m2 = b  m3 = c  m4 = d
+    SETZERO       m3
+    SETZERO       m4
+    mova          m1, [buf2 + i]
+    mova          m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1     r0q
+%define ptr2     r1q
+%define win      r2q
+%define j        r3q
+    mov          win, windowm
+    mov         ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+    add          win, i
+    add         ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1     r6q
+%define ptr2     r7q ; must be loaded
+%define win      r8q
+%define j        r9q
+    SETZERO       m9
+    SETZERO      m10
+    mova          m7, [buf2 + i + mmsize]
+    mova          m8, [buf2 + i + mmsize + 16 * 4]
+    lea          win, [windowq + i]
+    lea         ptr1, [synth_bufq + i]
+%endif
+    mov         ptr2, synth_bufmp
+    ; prepare the inner loop counter
+    mov            j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub         ptr2, i
+%endif
+.loop1:
+    INNER_LOOP  0
+    jge       .loop1
+
+    mov            j, 448 * 4
+    sub            j, OFFQ
+    jz          .end
+    sub         ptr1, j
+    sub         ptr2, j
+    add          win, OFFQ ; now at j-64, so define OFFSET
+    sub            j, 64 * 4
+.loop2:
+    INNER_LOOP  64 * 4
+    jge       .loop2
+
+.end:
+%if ARCH_X86_32
+    mov         buf2, synth_buf2m ; needed for next iteration anyway
+    mov         outq, outmp       ; j, which will be set again during it
+%endif
+    ;~ out[i]      = a * scale;
+    ;~ out[i + 16] = b * scale;
+    mulps         m1, m1, scale
+    mulps         m2, m2, scale
+%if ARCH_X86_64
+    mulps         m7, m7, scale
+    mulps         m8, m8, scale
+%endif
+    ;~ synth_buf2[i]      = c;
+    ;~ synth_buf2[i + 16] = d;
+    mova   [buf2 + i +  0 * 4], m3
+    mova   [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+    mova   [buf2 + i +  0 * 4 + mmsize], m9
+    mova   [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+    ;~ out[i]      = a;
+    ;~ out[i + 16] = a;
+    mova   [outq + i +  0 * 4], m1
+    mova   [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+    mova   [outq + i +  0 * 4 + mmsize], m7
+    mova   [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub            i, (ARCH_X86_64 + 1) * mmsize
+    jge    .mainloop
+%endif
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
--- a/libavcodec/x86/synth_filter_init.c
+++ b/libavcodec/x86/synth_filter_init.c
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt)                                                 \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
+                                 const float window[512],                      \
+                                 float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct,                              \
+                               float *synth_buf_ptr, int *synth_buf_offset,    \
+                               float synth_buf2[32], const float window[512],  \
+                               float out[32], const float in[32], float scale) \
+{                                                                              \
+    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
+                                                                               \
+    imdct->imdct_half(imdct, synth_buf, in);                                   \
+                                                                               \
+    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
+                                out, *synth_buf_offset, scale);                \
+                                                                               \
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
+}                                                                              \
+
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_YASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse;
+    }
+#endif
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse2;
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        s->synth_filter_float = synth_filter_avx;
+    }
+    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+        s->synth_filter_float = synth_filter_fma3;
+    }
+#endif /* HAVE_YASM */
+}