ARGB1555ToARGBRow_SSE2

BUG=none TEST=media_unittest Review URL: http://webrtc-codereview.appspot.com/349006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@133 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGB1555ToARGBRow_SSE2
BUG=none TEST=media_unittest Review URL: http://webrtc-codereview.appspot.com/349006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@133 16f28f9a-4ce2-e073-06de-1de4eb20be90
ccd6d9b2 · fbarchard@google.com · 6aa761da · ccd6d9b2 · ccd6d9b2 · ccd6d9b2
Commit ccd6d9b2 authored Jan 13, 2012 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 212 additions and 81 deletions

README.chromium README.chromium +1 -1

row.h source/row.h +17 -63

row_common.cc source/row_common.cc +9 -0

row_win.cc source/row_win.cc +185 -17

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 132
+Version: 133
 License: BSD
 License File: LICENSE


--- a/source/row.h
+++ b/source/row.h
@@ -60,8 +60,9 @@

 // The following are available on Windows platforms
 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
-#define HAS_ARGB4444TOARGBROW_SSE2
 #define HAS_RGB565TOARGBROW_SSE2
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
 #endif

 // The following are available on Neon platforms
@@ -82,64 +83,60 @@ namespace libyuv {
 extern "C" {
 #endif

-#ifdef HAS_FASTCONVERTYUVTOARGBROW_NEON
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) signed char vec8[16];
+typedef __declspec(align(16)) unsigned char uvec8[16];
+typedef __declspec(align(16)) signed short vec16[8];
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef signed char __attribute__((vector_size(16))) vec8;
+typedef unsigned char __attribute__((vector_size(16))) uvec8;
+typedef signed short __attribute__((vector_size(16))) vec16;
+#endif
+
+
 void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOBGRAROW_NEON
 void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOABGRROW_NEON
 void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORGB565ROW_NEON
 void FastConvertYUVToRGB565Row_NEON(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOARGB1555ROW_NEON
 void FastConvertYUVToARGB1555Row_NEON(const uint8* y_buf,
                                      const uint8* u_buf,
                                      const uint8* v_buf,
                                      uint8* rgb_buf,
                                      int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOARGB4444ROW_NEON
 void FastConvertYUVToARGB4444Row_NEON(const uint8* y_buf,
                                      const uint8* u_buf,
                                      const uint8* v_buf,
                                      uint8* rgb_buf,
                                      int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORGB24ROW_NEON
 void FastConvertYUVToRGB24Row_NEON(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORAWROW_NEON
 void FastConvertYUVToRAWRow_NEON(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
-#endif

-#ifdef HAS_ARGBTOYROW_SSSE3
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -149,11 +146,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
 void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -171,16 +163,9 @@ void ARGB1555ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 void ARGB4444ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width);

-#endif
-#ifdef HAS_REVERSE_ROW_SSSE3
 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_SSE2
 void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_NEON
 void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
-#endif
 void ReverseRow_C(const uint8* src, uint8* dst, int width);

 void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@@ -209,20 +194,14 @@ void ARGB1555ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
 void ARGB4444ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width);

-#ifdef HAS_RGB24TOARGBROW_SSSE3
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-// TODO(fbarchard): SSE2 555
-//void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
-#ifdef HAS_RGB565TOARGBROW_SSE2
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_SSE2
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
+
 void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
 void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
@@ -231,27 +210,9 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
 void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);

-#ifdef HAS_I400TOARGBROW_SSE2
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-#endif
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);

-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-typedef __declspec(align(16)) signed char vec8[16];
-typedef __declspec(align(16)) unsigned char uvec8[16];
-typedef __declspec(align(16)) signed short vec16[8];
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-typedef signed char __attribute__((vector_size(16))) vec8;
-typedef unsigned char __attribute__((vector_size(16))) uvec8;
-typedef signed short __attribute__((vector_size(16))) vec16;
-#endif
-
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-
 void FastConvertYUVToARGBRow_C(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
@@ -310,7 +271,6 @@ void FastConvertYToARGBRow_C(const uint8* y_buf,
                             uint8* rgb_buf,
                             int width);

-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
 void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
@@ -344,9 +304,7 @@ void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                uint8* rgb_buf,
                                int width);
-#endif

-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
@@ -400,15 +358,11 @@ void FastConvertYUVToRAWRow_SSSE3(const uint8* y_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width);
-#endif

-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                uint8* rgb_buf,
                                int width);

-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -328,7 +328,11 @@ void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {

 void ARGB1555ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
+#ifdef HAS_ARGB1555TOARGBROW_SSE2
+  ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
+#else
  ARGB1555ToARGBRow_C(src_argb, row, pix);
+#endif
  ARGBToYRow_SSSE3(row, dst_y, pix);
 }

@@ -378,8 +382,13 @@ void RGB565ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 void ARGB1555ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+#ifdef HAS_ARGB1555TOARGBROW_SSE2
+  ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
+  ARGB1555ToARGBRow_SSE2(src_argb + src_stride_argb, row + kMaxStride, pix);
+#else
  ARGB1555ToARGBRow_C(src_argb, row, pix);
  ARGB1555ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+#endif
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
 }


--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -229,53 +229,107 @@ __asm {
  }
 }

+#ifdef SHIFT565
+// Below shift/mask code is efficient and works, but more instructions than
+// pmul method
 // TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
+// 29 instructions
 __declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int pix) {
+void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                             int pix) {
 __asm {
    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 for Alpha
    pslld     xmm5, 24
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
    psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f
+    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
    psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc
+    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc for Green
    psrlw     xmm7, 10
    psllw     xmm7, 2

-
 convertloop:
    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
    lea       eax, [eax + 16]
-
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm4    // R in upper 5 bits
    psrlw     xmm2, 13      // R 3 bits
    psllw     xmm2, 8
    por       xmm1, xmm2
-
    movdqa    xmm2, xmm0
    pand      xmm2, xmm6    // mask B 5 bits
    movdqa    xmm3, xmm2
    psllw     xmm2, 3
    psrlw     xmm3, 2
    por       xmm2, xmm3
-
    por       xmm1, xmm2    // RB
-
    psrlw     xmm0, 3       // G in top 6 bits of lower byte
    pand      xmm0, xmm7    // mask G 6 bits
    movdqa    xmm2, xmm0
    psrlw     xmm2, 6
    por       xmm0, xmm2
-
    por       xmm0, xmm5   // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0
+    punpckhbw xmm2, xmm0
+    movdqa    [edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       edx, [edx + 32]
+    sub       ecx, 8
+    ja        convertloop
+    ret
+  }
+}

+// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
+// 33 instructions
+__declspec(naked)
+void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                               int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_argb1555
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm5, 8
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
+    psllw     xmm4, 11
+    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
+    psrlw     xmm6, 11
+    pcmpeqb   xmm7, xmm7       // generate mask 0x00f800f8 for Green
+    psrlw     xmm7, 11
+    psllw     xmm7, 3
+
+ convertloop:
+    movdqa    xmm0, [eax] // fetch 8 pixels of argb1555
+    lea       eax, [eax + 16]
+    movdqa    xmm1, xmm0
+    psllw     xmm1, 1
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm4    // R in upper 5 bits
+    psrlw     xmm2, 13      // R 3 bits
+    psllw     xmm2, 8
+    por       xmm1, xmm2
+    movdqa    xmm2, xmm0
+    pand      xmm2, xmm6    // mask B 5 bits
+    movdqa    xmm3, xmm2
+    psllw     xmm2, 3
+    psrlw     xmm3, 2
+    por       xmm2, xmm3
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    psrlw     xmm2, 2       // G in top 5 bits of lower byte
+    pand      xmm2, xmm7    // mask G 5 bits
+    movdqa    xmm3, xmm2
+    psrlw     xmm3, 5
+    por       xmm2, xmm3
+    psraw     xmm0, 8       // A
+    pand      xmm0, xmm5
+    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
@@ -287,8 +341,121 @@ __asm {
    ret
  }
 }
+#endif
+
+// Converts RGB565 to 32 bit ARGB, 8 pixels per pass, alpha forced to 0xff.
+// pmul method to replicate bits: with v in the top bits of a 16 bit word,
+// pmulhuw by (256 + 8) yields (v << 3) | (v >> 2) for a 5 bit v, and
+// pmulhuw by (256 + 4) yields (v << 2) | (v >> 4) for a 6 bit v.
+// G sits 5 bits below the top of the word, so its shift of 5 is incorporated
+// in the multiplier: (256 + 4) << 5 = 0x2080 (shifts of 5 + 8 and 5 + 2).
+// 20 instructions
+__declspec(naked)
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+__asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0    // broadcast multiplier to all 8 words
+    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0    // broadcast multiplier to all 8 words
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw     xmm4, 10
+    psrlw     xmm4, 5
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_rgb565
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax         // edx = dst_argb - 2 * src_rgb565, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the output
+
+ convertloop:
+    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm3    // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    psllw     xmm1, 8       // R to high byte of each word
+    por       xmm1, xmm2    // RB
+    pand      xmm0, xmm4    // G in middle 6 bits
+    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
+    por       xmm0, xmm7    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG: first 4 pixels
+    punpckhbw xmm2, xmm0    // last 4 pixels
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]  // advance source by 8 pixels
+    sub       ecx, 8
+    ja        convertloop   // repeat until count reaches 0 or borrows
+    ret
+  }
+}
+
+// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
+// Converts ARGB1555 to 32 bit ARGB, 8 pixels per pass; 24 instructions in loop
+__declspec(naked)
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+__asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0    // broadcast multiplier to all 8 words
+    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0    // broadcast multiplier to all 8 words
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x03e003e0 for Green
+    psllw     xmm4, 11
+    psrlw     xmm4, 6
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_argb1555
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax         // edx = dst_argb - 2 * src_argb1555, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the output
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    psllw     xmm1, 1       // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pand      xmm1, xmm3    // keep only the 5 R bits
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    psllw     xmm1, 8       // R to high byte of each word
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // G in middle 5 bits
+    psraw     xmm2, 8       // A
+    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm2, xmm7    // keep sign-filled high byte as 8 bit A
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG: first 4 pixels
+    punpckhbw xmm2, xmm0    // last 4 pixels
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]  // advance source by 8 pixels
+    sub       ecx, 8
+    ja        convertloop   // repeat until count reaches 0 or borrows
+    ret
+  }
+}

 // TODO(fbarchard): Port ARGB4444ToARGBRow_SSE2 to gcc
+// 18 instructions
 __declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
@@ -301,10 +468,11 @@ __asm {
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax
+    sub       edx, eax

 convertloop:
-    movdqa    xmm0, qword ptr [eax] // fetch 8 pixels of bgra4444
-    lea       eax, [eax + 16]
+    movdqa    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
@@ -317,9 +485,9 @@ __asm {
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
-    movdqa    [edx], xmm0  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm1  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
+    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
    sub       ecx, 8
    ja        convertloop
    ret