;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs
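; e.g. "cglobal FooRow, 3, 3, 3, src, dst, width" (illustrative name) declares
; three named parameters and tells x86inc that the body uses three general
; purpose and three SIMD registers.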

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
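; YUY2 packs pixels as Y0 U0 Y1 V0 (luma in the even bytes); UYVY packs
; U0 Y0 V0 Y1 (luma in the odd bytes).  The macro masks the even bytes
; (YUY2) or shifts down the odd bytes (UYVY), then packs the words back
; to bytes.  With AVX2, packuswb works within each 128-bit lane, so
; vpermq 0xd8 restores the natural ordering before the store.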

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      16
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2   ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
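; Each INIT_* below re-expands the macro for a register width (mmsize of
; 8, 16 or 32 bytes).  The second macro argument picks aligned (mova) or
; unaligned (movu) moves and the third appends _Unaligned to the name;
; x86inc's cglobal is expected to append the cpu suffix itself, giving
; e.g. YUY2ToYRow_SSE2.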
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
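; De-interleaves a packed UV row into separate U and V rows: even bytes
; (U) are masked with 0x00ff, odd bytes (V) are shifted down, and each
; half is packed back to bytes.  dst_v is pre-biased by dst_u so a single
; pointer increment advances both destinations.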

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      16
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
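; Interleaves the U and V rows into UV byte pairs.  src_v is pre-biased
; by src_u so a single increment advances both sources.  punpcklbw and
; punpckhbw form the low and high halves of the output; with AVX2 they
; work within each 128-bit lane, so vperm2i128 reorders the lanes first.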

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      16
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,