Fixed image rotators.

90, 180, 270 rotate of array with a minimum size of 8x8. Also deinterleave on rotate for NV12/NV21 formats. Review URL: http://webrtc-codereview.appspot.com/195002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@23 16f28f9a-4ce2-e073-06de-1de4eb20be90

Fixed image rotators.
90, 180, 270 rotate of array with a minimum size of 8x8. Also deinterleave on rotate for NV12/NV21 formats. Review URL: http://webrtc-codereview.appspot.com/195002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@23 16f28f9a-4ce2-e073-06de-1de4eb20be90
ed6edcab · frkoenig@google.com · 43575c8f · ed6edcab · ed6edcab · ed6edcab
Commit ed6edcab authored Oct 12, 2011 by frkoenig@google.com
9 changed files
--- a/libyuv_test.gyp
+++ b/libyuv_test.gyp
@@ -23,6 +23,7 @@

         # sources
         'unit_test/unit_test.cc',
+         'unit_test/rotate_test.cc',
      ], # source
      'conditions': [
        ['OS=="linux"', {

--- a/source/rotate.cc
+++ b/source/rotate.cc
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate.h"
+
+namespace libyuv {
+
+// Signature of a routine that writes a byte-reversed copy of one row:
+// (src, dst, width).
+typedef void (*reverse_func)(const uint8*, uint8*, int);
+// Signature of a transpose kernel for a strip that is exactly 8 rows tall:
+// (src, src_pitch, dst, dst_pitch, width).
+typedef void (*rotate_wx8func)(const uint8*, int, uint8*, int, int);
+// Signature of a transpose kernel for a strip of arbitrary height:
+// (src, src_pitch, dst, dst_pitch, width, height).
+typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int);
+
+// Assembly kernels (rotate_neon.s); only declared for NEON builds.
+#ifdef __ARM_NEON__
+extern "C" {
+void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
+void Transpose_wx8_NEON(const uint8* src, int src_pitch,
+                        uint8* dst, int dst_pitch, int width);
+}  // extern "C"
+#endif
+
+// Portable transpose of a strip that is exactly 8 rows tall and |w| bytes
+// wide: source column c becomes destination row c (8 bytes long).
+static void Transpose_wx8_C(const uint8* src, int src_pitch,
+                            uint8* dst, int dst_pitch,
+                            int w) {
+  for (int col = 0; col < w; ++col) {
+    const uint8* in_col = src + col;
+    uint8* out_row = dst + col * dst_pitch;
+    for (int row = 0; row < 8; ++row) {
+      out_row[row] = in_col[row * src_pitch];
+    }
+  }
+}
+
+// Portable transpose of a |width| x |height| block of bytes: source
+// column c becomes destination row c (|height| bytes long).
+static void Transpose_wxh_C(const uint8* src, int src_pitch,
+                            uint8* dst, int dst_pitch,
+                            int width, int height) {
+  for (int col = 0; col < width; ++col) {
+    const uint8* in_col = src + col;
+    uint8* out_row = dst + col * dst_pitch;
+    for (int row = 0; row < height; ++row) {
+      out_row[row] = in_col[row * src_pitch];
+    }
+  }
+}
+
+// Transposes a |width| x |height| plane of bytes: the byte at source
+// (row r, column c) is written to destination (row c, column r).
+//
+// src / src_pitch: first source row and the byte distance between rows.
+// dst / dst_pitch: first destination row and the byte distance between
+//                  rows; the destination is |height| bytes wide and
+//                  |width| rows tall.
+//
+// The plane is processed in strips of 8 source rows using the wx8 kernel;
+// any remaining rows (fewer than 8) go through the generic wxh kernel.
+// A strip is only handed to the wx8 kernel when 8 full rows are
+// available, so planes shorter than 8 rows are handled entirely by the
+// wxh kernel instead of reading past the end of the source.
+void Transpose(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  int i = height;
+  rotate_wx8func Transpose_wx8;
+  rotate_wxhfunc Transpose_wxh;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  Transpose_wx8 = Transpose_wx8_NEON;
+  Transpose_wxh = Transpose_wxh_C;
+#else
+  Transpose_wx8 = Transpose_wx8_C;
+  Transpose_wxh = Transpose_wxh_C;
+#endif
+
+  // work across the source in strips of 8 rows
+  while (i >= 8) {
+    Transpose_wx8(src, src_pitch, dst, dst_pitch, width);
+
+    src += 8 * src_pitch;  // next 8 source rows
+    dst += 8;              // they land 8 columns further right
+    i   -= 8;
+  }
+
+  // TODO(frkoenig): Have wx4 and maybe wx2
+  // leftover rows (1..7), if any
+  if (i > 0)
+    Transpose_wxh(src, src_pitch, dst, dst_pitch, width, i);
+}
+
+// Rotates a |width| x |height| plane by a quarter turn: destination
+// (row x, column y) receives source (row height-1-y, column x).
+// Implemented as a transpose of the source read from its last row upwards.
+void Rotate90(const uint8* src, int src_pitch,
+              uint8* dst, int dst_pitch,
+              int width, int height) {
+  const uint8* bottom_row = src + src_pitch*(height-1);
+
+  // A negative pitch makes Transpose walk the source rows bottom-to-top.
+  Transpose(bottom_row, -src_pitch, dst, dst_pitch, width, height);
+}
+
+// Rotates a |width| x |height| plane by three quarter turns: destination
+// (row width-1-x, column y) receives source (row y, column x).
+// Implemented as a transpose written from the last destination row upwards.
+void Rotate270(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  uint8* last_dst_row = dst + dst_pitch*(width-1);
+
+  // A negative pitch makes Transpose fill the destination rows bottom-to-top.
+  Transpose(src, src_pitch, last_dst_row, -dst_pitch, width, height);
+}
+
+// Copies |width| bytes from src to dst in reverse order:
+// dst[width-1 - i] = src[i].
+void ReverseLine_C(const uint8* src, uint8* dst, int width) {
+  // Read the source front to back while the write index counts down.
+  for (int out = width - 1; out >= 0; --out)
+    dst[out] = *src++;
+}
+
+// Rotates a |width| x |height| plane by a half turn: every source row is
+// byte-reversed and written to the vertically mirrored destination row.
+void Rotate180(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  reverse_func ReverseLine = ReverseLine_NEON;
+#else
+  reverse_func ReverseLine = ReverseLine_C;
+#endif
+
+  // The first source row lands in the last destination row.
+  dst += dst_pitch*(height-1);
+
+  for (int row = 0; row < height; ++row, src += src_pitch, dst -= dst_pitch)
+    ReverseLine(src, dst, width);
+}
+
+}  // namespace libyuv
--- a/source/rotate.h
+++ b/source/rotate.h
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_ROTATE_H_
+#define LIBYUV_SOURCE_ROTATE_H_
+
+#include "basic_types.h"
+
+namespace libyuv {
+// Single-plane rotations of a |width| x |height| plane of bytes.
+// |src_pitch| / |dst_pitch| are the byte distances between consecutive
+// rows of the source and destination.
+//
+// Rotate90:  dst(row x, col y) = src(row height-1-y, col x).
+void Rotate90(const uint8* src, int src_pitch,
+              uint8* dst, int dst_pitch,
+              int width, int height);
+// Rotate180: each source row is byte-reversed and stored in the
+// vertically mirrored destination row.
+void Rotate180(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+// Rotate270: dst(row width-1-x, col y) = src(row y, col x).
+void Rotate270(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+
+// Rotations of a plane of interleaved byte pairs that also split the
+// pairs: the first byte of every pair is written to |dst_a|, the second
+// to |dst_b|.  |width| is the source row length in bytes (the
+// implementations halve it to get the number of pairs).
+void Rotate90_deinterleave(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width, int height);
+void Rotate180_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height);
+void Rotate270_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height);
+
+// Transposes a |width| x |height| plane of bytes:
+// dst(row c, col r) = src(row r, col c).
+void Transpose(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+}  // namespace libyuv
+
+#endif  // LIBYUV_SOURCE_ROTATE_H_
--- a/source/rotate_deinterleave.cc
+++ b/source/rotate_deinterleave.cc
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate.h"
+
+namespace libyuv {
+
+// Signature of a routine that reverses one row of interleaved byte pairs
+// while splitting it: (src, dst_a, dst_b, width).
+typedef void (*reverse_func)(const uint8*, uint8*, uint8*, int);
+// Signature of a splitting transpose kernel for a strip exactly 8 rows
+// tall: (src, src_pitch, dst_a, dst_pitch_a, dst_b, dst_pitch_b, width).
+typedef void (*rotate_wx8func)(const uint8*, int,
+                               uint8*, int,
+                               uint8*, int, int);
+// Signature of a splitting transpose kernel for a strip of arbitrary
+// height: (src, src_pitch, dst_a, dst_pitch_a, dst_b, dst_pitch_b,
+// width, height).
+typedef void (*rotate_wxhfunc)(const uint8*, int,
+                               uint8*, int,
+                               uint8*, int, int, int);
+
+// Assembly routines (rotate_deinterleave_neon.s); only declared for NEON
+// builds.
+#ifdef __ARM_NEON__
+extern "C" {
+// Reloads d8-d15 from the 8-element buffer filled by SaveRegisters_NEON.
+void RestoreRegisters_NEON(unsigned long long *restore);
+void ReverseLine_di_NEON(const uint8* src,
+                         uint8* dst_a, uint8* dst_b,
+                         int width);
+// Stores d8-d15 into an 8-element buffer.
+void SaveRegisters_NEON(unsigned long long *store);
+void Transpose_di_wx8_NEON(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width);
+}  // extern "C"
+#endif
+
+// Portable splitting transpose of a strip exactly 8 rows tall that holds
+// |w| interleaved byte pairs per row.  Pair p of every row ends up in row
+// p of the outputs: first bytes in dst_a, second bytes in dst_b.
+static void Transpose_di_wx8_C(const uint8* src, int src_pitch,
+                               uint8* dst_a, int dst_pitch_a,
+                               uint8* dst_b, int dst_pitch_b,
+                               int w) {
+  for (int pair = 0; pair < w; ++pair) {
+    const uint8* in = src + pair*2;
+    uint8* out_a = dst_a + pair*dst_pitch_a;
+    uint8* out_b = dst_b + pair*dst_pitch_b;
+    for (int row = 0; row < 8; ++row) {
+      out_a[row] = in[row*src_pitch];
+      out_b[row] = in[row*src_pitch + 1];
+    }
+  }
+}
+
+// Portable splitting transpose of a strip |h| rows tall that holds |w|
+// interleaved byte pairs per row.  Pair p of every row ends up in row p
+// of the outputs: first bytes in dst_a, second bytes in dst_b.
+static void Transpose_di_wxh_C(const uint8* src, int src_pitch,
+                               uint8* dst_a, int dst_pitch_a,
+                               uint8* dst_b, int dst_pitch_b,
+                               int w, int h) {
+  for (int pair = 0; pair < w; ++pair) {
+    const uint8* in = src + pair*2;
+    uint8* out_a = dst_a + pair*dst_pitch_a;
+    uint8* out_b = dst_b + pair*dst_pitch_b;
+    for (int row = 0; row < h; ++row) {
+      out_a[row] = in[row*src_pitch];
+      out_b[row] = in[row*src_pitch + 1];
+    }
+  }
+}
+
+// Transposes a plane of interleaved byte pairs and splits it into two
+// planes: for pair p of source row r, the first byte is written to
+// dst_a(row p, column r) and the second byte to dst_b(row p, column r).
+//
+// |width| is the source row length in bytes; it is halved below to get
+// the number of pairs per row.  |height| is the number of source rows.
+//
+// The plane is processed in strips of 8 source rows using the wx8 kernel;
+// any remaining rows (fewer than 8) go through the generic wxh kernel.
+// A strip is only handed to the wx8 kernel when 8 full rows are
+// available, so planes shorter than 8 rows are handled entirely by the
+// wxh kernel instead of reading past the end of the source.
+void Transpose_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  int i = height;
+  rotate_wx8func Transpose_wx8;
+  rotate_wxhfunc Transpose_wxh;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  // The NEON kernel overwrites d8-d15 without preserving them, so they
+  // are saved here and restored before returning.
+  unsigned long long store_reg[8];
+  SaveRegisters_NEON(store_reg);
+  Transpose_wx8 = Transpose_di_wx8_NEON;
+  Transpose_wxh = Transpose_di_wxh_C;
+#else
+  Transpose_wx8 = Transpose_di_wx8_C;
+  Transpose_wxh = Transpose_di_wxh_C;
+#endif
+
+  // bytes per row -> pairs per row
+  width >>= 1;
+
+  // work across the source in strips of 8 rows
+  while (i >= 8) {
+    Transpose_wx8(src, src_pitch,
+                  dst_a, dst_pitch_a,
+                  dst_b, dst_pitch_b,
+                  width);
+
+    src   += 8 * src_pitch;  // next 8 source rows
+    dst_a += 8;              // they land 8 columns further right
+    dst_b += 8;
+    i     -= 8;
+  }
+
+  // leftover rows (1..7), if any
+  if (i > 0)
+    Transpose_wxh(src, src_pitch,
+                  dst_a, dst_pitch_a,
+                  dst_b, dst_pitch_b,
+                  width, i);
+
+#ifdef __ARM_NEON__
+  RestoreRegisters_NEON(store_reg);
+#endif
+}
+
+// Quarter-turn rotation of a plane of interleaved byte pairs that also
+// splits the pairs into dst_a / dst_b.  Implemented as a splitting
+// transpose of the source read from its last row upwards.
+void Rotate90_deinterleave(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width, int height) {
+  const uint8* bottom_row = src + src_pitch*(height-1);
+
+  // A negative pitch makes the transpose walk the source bottom-to-top.
+  Transpose_deinterleave(bottom_row, -src_pitch,
+                         dst_a, dst_pitch_a,
+                         dst_b, dst_pitch_b,
+                         width, height);
+}
+
+// Three-quarter-turn rotation of a plane of interleaved byte pairs that
+// also splits the pairs into dst_a / dst_b.  Implemented as a splitting
+// transpose written from the last destination row upwards.
+void Rotate270_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  // Each output has one row per source pair, i.e. width/2 rows.
+  uint8* last_row_a = dst_a + dst_pitch_a*((width>>1)-1);
+  uint8* last_row_b = dst_b + dst_pitch_b*((width>>1)-1);
+
+  // Negative pitches make the transpose fill the outputs bottom-to-top.
+  Transpose_deinterleave(src, src_pitch,
+                         last_row_a, -dst_pitch_a,
+                         last_row_b, -dst_pitch_b,
+                         width, height);
+}
+
+// Reverses a row of |width| interleaved byte pairs while splitting it:
+// pair p contributes its first byte to dst_a[width-1-p] and its second
+// byte to dst_b[width-1-p].
+static void ReverseLine_di_C(const uint8* src,
+                             uint8* dst_a, uint8* dst_b,
+                             int width) {
+  for (int pair = 0; pair < width; ++pair) {
+    const int out = width-1 - pair;
+    dst_a[out] = src[pair*2];
+    dst_b[out] = src[pair*2 + 1];
+  }
+}
+
+// Half-turn rotation of a plane of interleaved byte pairs that also
+// splits the pairs: every source row is reversed pair-wise and written to
+// the vertically mirrored row of dst_a (first bytes) and dst_b (second
+// bytes).  |width| is the source row length in bytes.
+void Rotate180_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  reverse_func ReverseLine = ReverseLine_di_NEON;
+#else
+  reverse_func ReverseLine = ReverseLine_di_C;
+#endif
+
+  // The first source row lands in the last row of each output.
+  dst_a += dst_pitch_a*(height-1);
+  dst_b += dst_pitch_b*(height-1);
+
+  // bytes per row -> pairs per row
+  const int pairs = width >> 1;
+
+  for (int row = 0; row < height; ++row) {
+    ReverseLine(src, dst_a, dst_b, pairs);
+
+    src   += src_pitch;
+    dst_a -= dst_pitch_a;
+    dst_b -= dst_pitch_b;
+  }
+}
+
+}  // namespace libyuv
--- a/source/rotate_deinterleave_neon.s
+++ b/source/rotate_deinterleave_neon.s
+  .global RestoreRegisters_NEON
+  .global ReverseLine_di_NEON
+  .global SaveRegisters_NEON
+  .global Transpose_di_wx8_NEON
+  .type RestoreRegisters_NEON, function
+  .type ReverseLine_di_NEON, function
+  .type SaveRegisters_NEON, function
+  .type Transpose_di_wx8_NEON, function
+
+@ void SaveRegisters_NEON (unsigned long long* store)
+@ Writes d8-d15 (8 x 64 bits) to the buffer at store.
+@ r0 unsigned long long* store (advanced past the data by the writeback)
+SaveRegisters_NEON:
+  vst1.i64    {d8, d9, d10, d11}, [r0]!
+  vst1.i64    {d12, d13, d14, d15}, [r0]!
+  bx          lr
+
+@ void RestoreRegisters_NEON (unsigned long long* store)
+@ Reloads d8-d15 (8 x 64 bits) from the buffer at store, in the same
+@ order SaveRegisters_NEON wrote them.
+@ r0 unsigned long long* store (advanced past the data by the writeback)
+RestoreRegisters_NEON:
+  vld1.i64    {d8, d9, d10, d11}, [r0]!
+  vld1.i64    {d12, d13, d14, d15}, [r0]!
+  bx          lr
+
+
+@ void ReverseLine_di_NEON (const uint8* src,
+@                           uint8* dst_a,
+@                           uint8* dst_b,
+@                           int width)
+@ Reads width interleaved byte pairs from src and writes them in reverse
+@ order, first bytes to dst_a and second bytes to dst_b.
+@ r0 const uint8* src
+@ r1 uint8* dst_a
+@ r2 uint8* dst_b
+@ r3 width (number of pairs = bytes written to each destination)
+ReverseLine_di_NEON:
+
+  @ compute where to start writing destination
+  add         r1, r1, r3      @ dst_a + width
+  add         r2, r2, r3      @ dst_b + width
+
+  @ work on input segments that are multiples of 16, but
+  @ width that has been passed is output segments, half
+  @ the size of input.  width / 8 == 0 means there is not a
+  @ single full segment, so go straight to the residuals.
+  lsrs        r12, r3, #3
+
+  beq         .line_residuals
+
+  @ the output is written in to two blocks.  each store steps
+  @ its destination pointer back by 8 bytes.
+  mov         r12, #-8
+
+  @ back of destination by the size of the register that is
+  @ going to be reversed
+  sub         r1, r1, #8
+  sub         r2, r2, #8
+
+  @ the loop consumes 16 input bytes (8 output bytes per
+  @ destination) per pass.  what will be left over is either a
+  @ negative number, the residuals that need to be done, or 0.
+  @ if this isn't subtracted off here the loop will run one
+  @ extra time.
+  sub         r3, r3, #8
+
+.segments_of_8:
+    @ d0 = first bytes of 8 pairs, d1 = second bytes
+    vld2.8      {d0, d1}, [r0]!         @ src += 16
+
+    @ reverse the bytes in the 64 bit segments
+    vrev64.8    q0, q0
+
+    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
+    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
+
+    subs        r3, r3, #8
+    bge         .segments_of_8
+
+  @ add 8 back to the counter.  if the result is 0 there is no
+  @ residuals so return
+  adds        r3, r3, #8
+  bxeq        lr
+
+  @ undo the 8 byte bias so r1/r2 point one past the next
+  @ bytes to write
+  add         r1, r1, #8
+  add         r2, r2, #8
+
+.line_residuals:
+
+  @ one pair at a time: each store steps back by a single byte
+  mov         r12, #-1
+
+  sub         r1, r1, #1
+  sub         r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+.segments_of_2:
+    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
+
+    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
+    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
+
+    subs        r3, r3, #1
+    bgt         .segments_of_2
+
+  bx          lr
+
+@ void Transpose_di_wx8_NEON (const uint8* src, int src_pitch,
+@                             uint8* dst_a, int dst_pitch_a,
+@                             uint8* dst_b, int dst_pitch_b,
+@                             int width)
+@
+@ Transposes a strip of 8 source rows holding width interleaved byte
+@ pairs per row.  Pair n of every row is written to row n of the
+@ destinations (8 bytes per row): first bytes to dst_a, second bytes
+@ to dst_b.
+@
+@ d8-d15 are overwritten and NOT preserved here; the caller must save
+@ and restore them (SaveRegisters_NEON / RestoreRegisters_NEON).
+@
+@ r0 const uint8* src
+@ r1 int src_pitch
+@ r2 uint8* dst_a
+@ r3 int dst_pitch_a
+@ stack uint8* dst_b
+@ stack int dst_pitch_b
+@ stack int width
+@
+@ working registers:
+@ r4  dst_b
+@ r5  dst_pitch_b
+@ r8  pairs (columns) still to process
+@ r9  scratch row pointer
+@ r12 address of the shuffle table
+Transpose_di_wx8_NEON:
+  push        {r4-r9,lr}
+
+  @ 7 registers were pushed, so the stack arguments start at sp + 28
+  ldr         r4, [sp, #28]         @ dst_b
+  ldr         r5, [sp, #32]         @ dst_pitch_b
+  ldr         r8, [sp, #36]         @ width
+  @ loops are on blocks of 8.  loop will stop when
+  @ counter gets to or below 0.  starting the counter
+  @ at w-8 allow for this.  with fewer than 8 columns the
+  @ 8x8 loop must not run at all, go straight to the residuals.
+  subs        r8, #8
+  blt         .residual_columns
+
+@ handle 8x8 blocks.  this should be the majority of the plane
+.loop_8x8:
+    mov         r9, r0
+
+    @ 8 rows of 8 pairs; even d registers get the first bytes,
+    @ odd d registers get the second bytes
+    vld2.8      {d0,  d1},  [r9], r1
+    vld2.8      {d2,  d3},  [r9], r1
+    vld2.8      {d4,  d5},  [r9], r1
+    vld2.8      {d6,  d7},  [r9], r1
+    vld2.8      {d8,  d9},  [r9], r1
+    vld2.8      {d10, d11}, [r9], r1
+    vld2.8      {d12, d13}, [r9], r1
+    vld2.8      {d14, d15}, [r9]
+
+    vtrn.8      q1, q0
+    vtrn.8      q3, q2
+    vtrn.8      q5, q4
+    vtrn.8      q7, q6
+
+    vtrn.16     q1, q3
+    vtrn.16     q0, q2
+    vtrn.16     q5, q7
+    vtrn.16     q4, q6
+
+    vtrn.32     q1, q5
+    vtrn.32     q0, q4
+    vtrn.32     q3, q7
+    vtrn.32     q2, q6
+
+    vrev16.8    q0, q0
+    vrev16.8    q1, q1
+    vrev16.8    q2, q2
+    vrev16.8    q3, q3
+    vrev16.8    q4, q4
+    vrev16.8    q5, q5
+    vrev16.8    q6, q6
+    vrev16.8    q7, q7
+
+    @ 8 rows of dst_a
+    mov         r9, r2
+
+    vst1.8      {d2},  [r9], r3
+    vst1.8      {d0},  [r9], r3
+    vst1.8      {d6},  [r9], r3
+    vst1.8      {d4},  [r9], r3
+    vst1.8      {d10}, [r9], r3
+    vst1.8      {d8},  [r9], r3
+    vst1.8      {d14}, [r9], r3
+    vst1.8      {d12}, [r9]
+
+    @ 8 rows of dst_b
+    mov         r9, r4
+
+    vst1.8      {d3},  [r9], r5
+    vst1.8      {d1},  [r9], r5
+    vst1.8      {d7},  [r9], r5
+    vst1.8      {d5},  [r9], r5
+    vst1.8      {d11}, [r9], r5
+    vst1.8      {d9},  [r9], r5
+    vst1.8      {d15}, [r9], r5
+    vst1.8      {d13}, [r9]
+
+    add         r0, #8*2          @ src   += 8*2
+    add         r2, r3, lsl #3    @ dst_a += 8 * dst_pitch_a
+    add         r4, r5, lsl #3    @ dst_b += 8 * dst_pitch_b
+    subs        r8,  #8           @ w     -= 8
+    bge         .loop_8x8
+
+.residual_columns:
+  @ add 8 back to counter.  if the result is 0 there are
+  @ no residuals.
+  adds        r8, #8
+  beq         .done
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp         r8, #2
+  blt         .block_1x8
+
+  cmp         r8, #4
+  blt         .block_2x8
+
+@ TODO(frkoenig) : clean this up
+.block_4x8:
+  @ 8 rows of 4 pairs (8 bytes per row)
+  mov         r9, r0
+  vld1.64     {d0}, [r9], r1
+  vld1.64     {d1}, [r9], r1
+  vld1.64     {d2}, [r9], r1
+  vld1.64     {d3}, [r9], r1
+  vld1.64     {d4}, [r9], r1
+  vld1.64     {d5}, [r9], r1
+  vld1.64     {d6}, [r9], r1
+  vld1.64     {d7}, [r9]
+
+  adr         r12, vtbl_4x4_transpose
+  vld1.8      {q7}, [r12]
+
+  @ separate first bytes (q0, q2) from second bytes (q1, q3)
+  vtrn.8      q0, q1
+  vtrn.8      q2, q3
+
+  vtbl.8      d8,  {d0, d1}, d14
+  vtbl.8      d9,  {d0, d1}, d15
+  vtbl.8      d10, {d2, d3}, d14
+  vtbl.8      d11, {d2, d3}, d15
+  vtbl.8      d12, {d4, d5}, d14
+  vtbl.8      d13, {d4, d5}, d15
+  vtbl.8      d0,  {d6, d7}, d14
+  vtbl.8      d1,  {d6, d7}, d15
+
+  @ dst_a, bytes 0-3 of each of the 4 rows (source rows 0-3)
+  mov         r9, r2
+
+  vst1.32     {d8[0]},  [r9], r3
+  vst1.32     {d8[1]},  [r9], r3
+  vst1.32     {d9[0]},  [r9], r3
+  vst1.32     {d9[1]},  [r9], r3
+
+  @ dst_a, bytes 4-7 of each of the 4 rows (source rows 4-7)
+  add         r9, r2, #4
+  vst1.32     {d12[0]}, [r9], r3
+  vst1.32     {d12[1]}, [r9], r3
+  vst1.32     {d13[0]}, [r9], r3
+  vst1.32     {d13[1]}, [r9]
+
+  @ dst_b, bytes 0-3 of each of the 4 rows
+  mov         r9, r4
+
+  vst1.32     {d10[0]}, [r9], r5
+  vst1.32     {d10[1]}, [r9], r5
+  vst1.32     {d11[0]}, [r9], r5
+  vst1.32     {d11[1]}, [r9], r5
+
+  @ dst_b, bytes 4-7 of each of the 4 rows
+  add         r9, r4, #4
+  vst1.32     {d0[0]},  [r9], r5
+  vst1.32     {d0[1]},  [r9], r5
+  vst1.32     {d1[0]},  [r9], r5
+  vst1.32     {d1[1]},  [r9]
+
+  add         r0, #4*2          @ src   += 4 * 2
+  add         r2, r3, lsl #2    @ dst_a += 4 * dst_pitch_a
+  add         r4, r5, lsl #2    @ dst_b += 4 * dst_pitch_b
+  subs        r8,  #4           @ w     -= 4
+  beq         .done
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp         r8, #2
+  blt         .block_1x8
+
+.block_2x8:
+  @ 8 rows of 2 pairs; pair 0 goes to d0/d1, pair 1 to d2/d3
+  mov         r9, r0
+  vld2.16     {d0[0], d2[0]}, [r9], r1
+  vld2.16     {d1[0], d3[0]}, [r9], r1
+  vld2.16     {d0[1], d2[1]}, [r9], r1
+  vld2.16     {d1[1], d3[1]}, [r9], r1
+  vld2.16     {d0[2], d2[2]}, [r9], r1
+  vld2.16     {d1[2], d3[2]}, [r9], r1
+  vld2.16     {d0[3], d2[3]}, [r9], r1
+  vld2.16     {d1[3], d3[3]}, [r9]
+
+  @ after this d0/d2 hold the first bytes, d1/d3 the second bytes
+  vtrn.8      d0, d1
+  vtrn.8      d2, d3
+
+  mov         r9, r2
+
+  vst1.64     {d0}, [r9], r3
+  vst1.64     {d2}, [r9]
+
+  mov         r9, r4
+
+  vst1.64     {d1}, [r9], r5
+  vst1.64     {d3}, [r9]
+
+  add         r0, #2*2          @ src   += 2 * 2
+  add         r2, r3, lsl #1    @ dst_a += 2 * dst_pitch_a
+  add         r4, r5, lsl #1    @ dst_b += 2 * dst_pitch_b
+  subs        r8,  #2           @ w     -= 2
+  beq         .done
+
+.block_1x8:
+  @ last pair of each of the 8 rows, gathered lane by lane
+  vld2.8      {d0[0], d1[0]}, [r0], r1
+  vld2.8      {d0[1], d1[1]}, [r0], r1
+  vld2.8      {d0[2], d1[2]}, [r0], r1
+  vld2.8      {d0[3], d1[3]}, [r0], r1
+  vld2.8      {d0[4], d1[4]}, [r0], r1
+  vld2.8      {d0[5], d1[5]}, [r0], r1
+  vld2.8      {d0[6], d1[6]}, [r0], r1
+  vld2.8      {d0[7], d1[7]}, [r0]
+
+  vst1.64     {d0}, [r2]
+  vst1.64     {d1}, [r4]
+
+.done:
+  pop         {r4-r9, pc}
+
+@ vtbl indices used by .block_4x8 to regroup the bytes of two d registers
+vtbl_4x4_transpose:
+  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
--- a/source/rotate_neon.s
+++ b/source/rotate_neon.s
+  .global ReverseLine_NEON
+  .global Transpose_wx8_NEON
+  .type ReverseLine_NEON, function
+  .type Transpose_wx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ Copies width bytes from src to dst in reverse order.
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
+ReverseLine_NEON:
+
+  @ compute where to start writing destination
+  add         r1, r2      @ dst + width
+
+  @ work on segments that are multiples of 16.  the flags set
+  @ here (width / 16 == 0) are tested by the beq below.
+  lsrs        r3, r2, #4
+
+  @ each 16 byte segment is written as two 8 byte blocks.
+  @ reading is done sequentially, from left to right, while
+  @ writing moves from right to left.  the destination pointer
+  @ is advanced by 8 after the first block is written, so the
+  @ second store has to step back by that 8 plus 16 to reach
+  @ the next segment.
+  mov         r3, #-24
+
+  beq         .line_residuals
+
+  @ back of destination by the size of the register that is
+  @ going to be reversed
+  sub         r1, #16
+
+  @ the loop needs to run on blocks of 16.  what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0.  if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub         r2, #16
+
+.segments_of_16:
+    vld1.8      {q0}, [r0]!               @ src += 16
+
+    @ reverse the bytes in the 64 bit segments.  unable to reverse
+    @ the bytes in the entire 128 bits in one go.
+    vrev64.8    q0, q0
+
+    @ because of the inability to reverse the entire 128 bits
+    @ reverse the writing out of the two 64 bit segments.
+    vst1.8      {d1}, [r1]!
+    vst1.8      {d0}, [r1], r3            @ dst -= 16
+
+    subs        r2, #16
+    bge         .segments_of_16
+
+  @ add 16 back to the counter.  if the result is 0 there is no
+  @ residuals so return
+  adds        r2, #16
+  bxeq        lr
+
+  @ undo the 16 byte bias so r1 points one past the next byte
+  @ to write
+  add         r1, #16
+
+.line_residuals:
+
+  @ two bytes are written per pass: the first store advances
+  @ by 1, the second steps back by 3 for a net move of -2.
+  mov         r3, #-3
+
+  sub         r1, #2
+  subs        r2, #2
+  @ check for 16*n+1 scenarios where segments_of_2 should not
+  @ be run, but there is something left over.
+  blt         .segment_of_1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+.segments_of_2:
+    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
+
+    @ second source byte goes first so the pair ends up swapped
+    vst1.8      {d1[0]}, [r1]!
+    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2
+
+    subs        r2, #2
+    bge         .segments_of_2
+
+  adds        r2, #2
+  bxeq        lr
+
+.segment_of_1:
+  @ r1 is positioned for a pair; the single remaining byte
+  @ belongs in the upper of those two slots
+  add         r1, #1
+  vld1.8      {d0[0]}, [r0]
+  vst1.8      {d0[0]}, [r1]
+
+  bx          lr
+
+@ void Transpose_wx8_NEON (const uint8* src, int src_pitch,
+@                          uint8* dst, int dst_pitch,
+@                          int w)
+@
+@ Transposes a strip of 8 source rows that is w bytes wide: source
+@ column n is written to destination row n (8 bytes per row).
+@
+@ r0 const uint8* src
+@ r1 int src_pitch
+@ r2 uint8* dst
+@ r3 int dst_pitch
+@ stack int w
+@
+@ working registers:
+@ r8  columns still to process
+@ r9  scratch row pointer
+@ r12 address of the shuffle table
+Transpose_wx8_NEON:
+  push        {r4,r8,r9,lr}
+
+  @ 4 registers were pushed, so the stack argument is at sp + 16
+  ldr         r8, [sp, #16]        @ width
+
+  @ loops are on blocks of 8.  loop will stop when
+  @ counter gets to or below 0.  starting the counter
+  @ at w-8 allow for this.  with fewer than 8 columns the
+  @ 8x8 loop must not run at all, go straight to the residuals.
+  subs        r8, #8
+  blt         .residual_columns
+
+@ handle 8x8 blocks.  this should be the majority of the plane
+.loop_8x8:
+    mov         r9, r0
+
+    @ 8 rows of 8 bytes
+    vld1.8      {d0}, [r9], r1
+    vld1.8      {d1}, [r9], r1
+    vld1.8      {d2}, [r9], r1
+    vld1.8      {d3}, [r9], r1
+    vld1.8      {d4}, [r9], r1
+    vld1.8      {d5}, [r9], r1
+    vld1.8      {d6}, [r9], r1
+    vld1.8      {d7}, [r9]
+
+    vtrn.8      d1, d0
+    vtrn.8      d3, d2
+    vtrn.8      d5, d4
+    vtrn.8      d7, d6
+
+    vtrn.16     d1, d3
+    vtrn.16     d0, d2
+    vtrn.16     d5, d7
+    vtrn.16     d4, d6
+
+    vtrn.32     d1, d5
+    vtrn.32     d0, d4
+    vtrn.32     d3, d7
+    vtrn.32     d2, d6
+
+    vrev16.8    q0, q0
+    vrev16.8    q1, q1
+    vrev16.8    q2, q2
+    vrev16.8    q3, q3
+
+    @ 8 destination rows of 8 bytes
+    mov         r9, r2
+
+    vst1.8      {d1}, [r9], r3
+    vst1.8      {d0}, [r9], r3
+    vst1.8      {d3}, [r9], r3
+    vst1.8      {d2}, [r9], r3
+    vst1.8      {d5}, [r9], r3
+    vst1.8      {d4}, [r9], r3
+    vst1.8      {d7}, [r9], r3
+    vst1.8      {d6}, [r9]
+
+    add         r0, #8            @ src += 8
+    add         r2, r3, lsl #3    @ dst += 8 * dst_pitch
+    subs        r8,  #8           @ w   -= 8
+    bge         .loop_8x8
+
+.residual_columns:
+  @ add 8 back to counter.  if the result is 0 there are
+  @ no residuals.
+  adds        r8, #8
+  beq         .done
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp         r8, #2
+  blt         .block_1x8
+
+  cmp         r8, #4
+  blt         .block_2x8
+
+.block_4x8:
+  @ 8 rows of 4 bytes, two rows per d register
+  mov         r9, r0
+  vld1.32     {d0[0]}, [r9], r1
+  vld1.32     {d0[1]}, [r9], r1
+  vld1.32     {d1[0]}, [r9], r1
+  vld1.32     {d1[1]}, [r9], r1
+  vld1.32     {d2[0]}, [r9], r1
+  vld1.32     {d2[1]}, [r9], r1
+  vld1.32     {d3[0]}, [r9], r1
+  vld1.32     {d3[1]}, [r9]
+
+  mov         r9, r2
+
+  adr         r12, vtbl_4x4_transpose
+  vld1.8      {q3}, [r12]
+
+  @ transpose each group of 4 rows x 4 bytes with a table lookup
+  vtbl.8      d4, {d0, d1}, d6
+  vtbl.8      d5, {d0, d1}, d7
+  vtbl.8      d0, {d2, d3}, d6
+  vtbl.8      d1, {d2, d3}, d7
+
+  @ TODO: rework shuffle above to write
+  @       out with 4 instead of 8 writes
+  @ bytes 0-3 of each of the 4 destination rows (source rows 0-3)
+  vst1.32     {d4[0]}, [r9], r3
+  vst1.32     {d4[1]}, [r9], r3
+  vst1.32     {d5[0]}, [r9], r3
+  vst1.32     {d5[1]}, [r9]
+
+  @ bytes 4-7 of each of the 4 destination rows (source rows 4-7)
+  add         r9, r2, #4
+  vst1.32     {d0[0]}, [r9], r3
+  vst1.32     {d0[1]}, [r9], r3
+  vst1.32     {d1[0]}, [r9], r3
+  vst1.32     {d1[1]}, [r9]
+
+  add         r0, #4            @ src += 4
+  add         r2, r3, lsl #2    @ dst += 4 * dst_pitch
+  subs        r8,  #4           @ w   -= 4
+  beq         .done
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp         r8, #2
+  blt         .block_1x8
+
+.block_2x8:
+  @ 8 rows of 2 bytes; even rows go to d0, odd rows to d1
+  mov         r9, r0
+  vld1.16     {d0[0]}, [r9], r1
+  vld1.16     {d1[0]}, [r9], r1
+  vld1.16     {d0[1]}, [r9], r1
+  vld1.16     {d1[1]}, [r9], r1
+  vld1.16     {d0[2]}, [r9], r1
+  vld1.16     {d1[2]}, [r9], r1
+  vld1.16     {d0[3]}, [r9], r1
+  vld1.16     {d1[3]}, [r9]
+
+  vtrn.8      d0, d1
+
+  mov         r9, r2
+
+  vst1.64     {d0}, [r9], r3
+  vst1.64     {d1}, [r9]
+
+  add         r0, #2            @ src += 2
+  add         r2, r3, lsl #1    @ dst += 2 * dst_pitch
+  subs        r8,  #2           @ w   -= 2
+  beq         .done
+
+.block_1x8:
+  @ last byte of each of the 8 rows, gathered lane by lane
+  vld1.8      {d0[0]}, [r0], r1
+  vld1.8      {d0[1]}, [r0], r1
+  vld1.8      {d0[2]}, [r0], r1
+  vld1.8      {d0[3]}, [r0], r1
+  vld1.8      {d0[4]}, [r0], r1
+  vld1.8      {d0[5]}, [r0], r1
+  vld1.8      {d0[6]}, [r0], r1
+  vld1.8      {d0[7]}, [r0]
+
+  vst1.64     {d0}, [r2]
+
+.done:
+
+  pop         {r4,r8,r9,pc}
+
+@ vtbl indices used by .block_4x8 to transpose 4 rows of 4 bytes
+vtbl_4x4_transpose:
+  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -20,7 +20,9 @@ class libyuvEnvironment : public ::testing::Environment {
  }
 };

-libyuvTest::libyuvTest()
+libyuvTest::libyuvTest() :
+  _rotate_max_w(128),
+  _rotate_max_h(128)
 {
 }


--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -11,6 +11,7 @@
 #ifndef UINIT_TEST_H_
 #define UINIT_TEST_H_

+#include "basic_types.h"
 #include <gtest/gtest.h>

 class libyuvTest : public ::testing::Test {
@@ -18,6 +19,10 @@ class libyuvTest : public ::testing::Test {
  libyuvTest();
  virtual void SetUp();
  virtual void TearDown();
+
+  const uint32 _rotate_max_w;
+  const uint32 _rotate_max_h;
+
 };

 #endif // UNIT_TEST_H_