;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS.  All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric arguments: number of parameters, gpr regs used, mm regs used.

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
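;
; A scalar C sketch of the behavior (hypothetical reference code, not part
; of this file; assumes pix is a positive multiple of mmsize):
;
;   void YUY2ToYRow(const uint8* src_yuy2, uint8* dst_y, int pix) {
;     int x;
;     for (x = 0; x < pix; ++x) {
;       dst_y[x] = src_yuy2[x * 2];   /* YUY2 keeps Y in the even bytes */
;     }
;   }
;
; For UYVY the Y samples are the odd bytes, which the %else branch below
; extracts with psrlw instead of masking with pand.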

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      16
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2   ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
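
; Note: with x86inc.asm, each INIT_MMX/INIT_XMM/INIT_YMM sets mmsize
; (8/16/32 bytes) and cglobal appends the cpu name, so the invocations
; above should expand to YUY2ToYRow_MMX, YUY2ToYRow_Unaligned_SSE2,
; UYVYToYRow_AVX2, and so on.  The mov%2 argument selects mova (aligned)
; or movu (unaligned) loads and stores.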

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
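;
; A scalar C sketch of the behavior (hypothetical reference code, not part
; of this file; assumes pix is a positive multiple of mmsize):
;
;   void SplitUVRow(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
;                   int pix) {
;     int x;
;     for (x = 0; x < pix; ++x) {
;       dst_u[x] = src_uv[x * 2];       /* even bytes are U */
;       dst_v[x] = src_uv[x * 2 + 1];   /* odd bytes are V */
;     }
;   }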

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      16
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    mova       m2, m0
    mova       m3, m1
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    psrlw      m2, m2, 8         ; odd bytes
    psrlw      m3, m3, 8
    packuswb   m2, m2, m3
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,
SplitUVRow u,_Unaligned

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
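;
; A scalar C sketch of the behavior (hypothetical reference code, not part
; of this file; assumes width is a positive multiple of mmsize):
;
;   void MergeUVRow(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                   int width) {
;     int x;
;     for (x = 0; x < width; ++x) {
;       dst_uv[x * 2] = src_u[x];       /* interleave U and V bytes */
;       dst_uv[x * 2 + 1] = src_v[x];
;     }
;   }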

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      16
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_vq]
    lea        src_uq, [src_uq + mmsize]
    mova       m2, m0
    punpcklbw  m0, m0, m1       ; first 8 UV pairs
    punpckhbw  m2, m2, m1       ; next 8 UV pairs
    mov%1      [dst_uvq], m0
    mov%1      [dst_uvq + mmsize], m2
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned