/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
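// vtbl.8 shuffle table that gathers bytes 0,4,8,12, 1,5,9,13, ... i.e.
// transposes a 4x4 block of bytes held in a pair of d registers.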
static uvec8 kVTbl4x4Transpose =
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };

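// Transposes an 8-row by |width|-column block of src into a |width|-row by
// 8-column block of dst. A rough scalar equivalent (reference only):
//   for (int x = 0; x < width; ++x)
//     for (int y = 0; y < 8; ++y)
//       dst[x * dst_stride + y] = src[y * src_stride + x];
// The asm below does this 8 columns at a time, with 4/2/1 column residual
// paths.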
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %4, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"

      "vld1.8      {d0}, [r9], %1              \n"
      "vld1.8      {d1}, [r9], %1              \n"
      "vld1.8      {d2}, [r9], %1              \n"
      "vld1.8      {d3}, [r9], %1              \n"
      "vld1.8      {d4}, [r9], %1              \n"
      "vld1.8      {d5}, [r9], %1              \n"
      "vld1.8      {d6}, [r9], %1              \n"
      "vld1.8      {d7}, [r9]                  \n"

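      // Transpose the 8x8 block: vtrn.8/.16/.32 exchange 1-, 2- and 4-byte
      // sub-blocks between register pairs; vrev16.8 then swaps the bytes in
      // each 16-bit lane to finish the transpose.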
      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

      "mov         r9, %2                      \n"

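      // Store the transposed rows; the register order (d1 before d0, etc.)
      // matches the swapped vtrn operand order used above.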
      "vst1.8      {d1}, [r9], %3              \n"
      "vst1.8      {d0}, [r9], %3              \n"
      "vst1.8      {d3}, [r9], %3              \n"
      "vst1.8      {d2}, [r9], %3              \n"
      "vst1.8      {d5}, [r9], %3              \n"
      "vst1.8      {d4}, [r9], %3              \n"
      "vst1.8      {d7}, [r9], %3              \n"
      "vst1.8      {d6}, [r9]                  \n"

      "add         %0, #8                      \n"  // src += 8
      "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %4,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %4, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    "cmp         %4, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.32     {d0[0]}, [r9], %1             \n"
    "vld1.32     {d0[1]}, [r9], %1             \n"
    "vld1.32     {d1[0]}, [r9], %1             \n"
    "vld1.32     {d1[1]}, [r9], %1             \n"
    "vld1.32     {d2[0]}, [r9], %1             \n"
    "vld1.32     {d2[1]}, [r9], %1             \n"
    "vld1.32     {d3[0]}, [r9], %1             \n"
    "vld1.32     {d3[1]}, [r9]                 \n"

    "mov         r9, %2                        \n"

    "vld1.8      {q3}, [%5]                    \n"

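    // d6/d7 now hold the kVTbl4x4Transpose indices; each vtbl pair below
    // transposes one 4x4 half of the 4x8 block.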
    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    "vst1.32     {d4[0]}, [r9], %3             \n"
    "vst1.32     {d4[1]}, [r9], %3             \n"
    "vst1.32     {d5[0]}, [r9], %3             \n"
    "vst1.32     {d5[1]}, [r9]                 \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d0[0]}, [r9], %3             \n"
    "vst1.32     {d0[1]}, [r9], %3             \n"
    "vst1.32     {d1[0]}, [r9], %3             \n"
    "vst1.32     {d1[1]}, [r9]                 \n"

    "add         %0, #4                        \n"  // src += 4
    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %4,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld1.16     {d0[0]}, [r9], %1             \n"
    "vld1.16     {d1[0]}, [r9], %1             \n"
    "vld1.16     {d0[1]}, [r9], %1             \n"
    "vld1.16     {d1[1]}, [r9], %1             \n"
    "vld1.16     {d0[2]}, [r9], %1             \n"
    "vld1.16     {d1[2]}, [r9], %1             \n"
    "vld1.16     {d0[3]}, [r9], %1             \n"
    "vld1.16     {d1[3]}, [r9]                 \n"

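    // rows were loaded alternately into d0 and d1 (16 bits per row), so a
    // single vtrn.8 leaves column 0 in d0 and column 1 in d1.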
    "vtrn.8      d0, d1                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d1}, [r9]                    \n"
    "add         %0, #2                        \n"  // src += 2
    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %4,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld1.8      {d0[0]}, [%0], %1             \n"
    "vld1.8      {d0[1]}, [%0], %1             \n"
    "vld1.8      {d0[2]}, [%0], %1             \n"
    "vld1.8      {d0[3]}, [%0], %1             \n"
    "vld1.8      {d0[4]}, [%0], %1             \n"
    "vld1.8      {d0[5]}, [%0], %1             \n"
    "vld1.8      {d0[6]}, [%0], %1             \n"
    "vld1.8      {d0[7]}, [%0]                 \n"

    "vst1.64     {d0}, [%2]                    \n"

    "4:                                        \n"

    : "+r"(src),               // %0
      "+r"(src_stride),        // %1
      "+r"(dst),               // %2
      "+r"(dst_stride),        // %3
      "+r"(width)              // %4
    : "r"(&kVTbl4x4Transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}

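// vtbl.8 shuffle table that zips the two halves of a q register byte-wise:
// 0,8, 1,9, 2,10, ... 7,15.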
static uvec8 kVTbl4x4TransposeDi =
  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };

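// Transposes an 8-row block of interleaved UV pairs into separate transposed
// U and V planes (width counts UV pairs). A rough scalar equivalent
// (reference only):
//   for (int x = 0; x < width; ++x)
//     for (int y = 0; y < 8; ++y) {
//       dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];
//       dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];
//     }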
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %6, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align  2                               \n"
    "1:                                        \n"
      "mov         r9, %0                      \n"

      "vld2.8      {d0,  d1},  [r9], %1        \n"
      "vld2.8      {d2,  d3},  [r9], %1        \n"
      "vld2.8      {d4,  d5},  [r9], %1        \n"
      "vld2.8      {d6,  d7},  [r9], %1        \n"
      "vld2.8      {d16, d17}, [r9], %1        \n"
      "vld2.8      {d18, d19}, [r9], %1        \n"
      "vld2.8      {d20, d21}, [r9], %1        \n"
      "vld2.8      {d22, d23}, [r9]            \n"

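      // Each vld2.8 above de-interleaved a row: U bytes in the first register
      // of each pair (d0, d2, ...) and V bytes in the second (d1, d3, ...),
      // so the q-register transpose below handles both planes at once.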
      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

      "mov         r9, %2                      \n"

      "vst1.8      {d2},  [r9], %3             \n"
      "vst1.8      {d0},  [r9], %3             \n"
      "vst1.8      {d6},  [r9], %3             \n"
      "vst1.8      {d4},  [r9], %3             \n"
      "vst1.8      {d18}, [r9], %3             \n"
      "vst1.8      {d16}, [r9], %3             \n"
      "vst1.8      {d22}, [r9], %3             \n"
      "vst1.8      {d20}, [r9]                 \n"

      "mov         r9, %4                      \n"

      "vst1.8      {d3},  [r9], %5             \n"
      "vst1.8      {d1},  [r9], %5             \n"
      "vst1.8      {d7},  [r9], %5             \n"
      "vst1.8      {d5},  [r9], %5             \n"
      "vst1.8      {d19}, [r9], %5             \n"
      "vst1.8      {d17}, [r9], %5             \n"
      "vst1.8      {d23}, [r9], %5             \n"
      "vst1.8      {d21}, [r9]                 \n"

      "add         %0, #8*2                    \n"  // src   += 8*2
      "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %6,  #8                     \n"  // w     -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %6, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    "cmp         %6, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.64     {d0}, [r9], %1                \n"
    "vld1.64     {d1}, [r9], %1                \n"
    "vld1.64     {d2}, [r9], %1                \n"
    "vld1.64     {d3}, [r9], %1                \n"
    "vld1.64     {d4}, [r9], %1                \n"
    "vld1.64     {d5}, [r9], %1                \n"
    "vld1.64     {d6}, [r9], %1                \n"
    "vld1.64     {d7}, [r9]                    \n"

    "vld1.8      {q15}, [%7]                   \n"

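    // q15 (d30/d31) holds the kVTbl4x4TransposeDi indices. the vtrn.8 plus
    // vtbl sequence below transposes the 4x8 block of UV pairs, leaving U
    // results in d16/d17/d20/d21 and V results in d18/d19/d22/d23.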
    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         r9, %2                        \n"

    "vst1.32     {d16[0]},  [r9], %3           \n"
    "vst1.32     {d16[1]},  [r9], %3           \n"
    "vst1.32     {d17[0]},  [r9], %3           \n"
    "vst1.32     {d17[1]},  [r9], %3           \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d20[0]}, [r9], %3            \n"
    "vst1.32     {d20[1]}, [r9], %3            \n"
    "vst1.32     {d21[0]}, [r9], %3            \n"
    "vst1.32     {d21[1]}, [r9]                \n"

    "mov         r9, %4                        \n"

    "vst1.32     {d18[0]}, [r9], %5            \n"
    "vst1.32     {d18[1]}, [r9], %5            \n"
    "vst1.32     {d19[0]}, [r9], %5            \n"
    "vst1.32     {d19[1]}, [r9], %5            \n"

    "add         r9, %4, #4                    \n"
    "vst1.32     {d22[0]},  [r9], %5           \n"
    "vst1.32     {d22[1]},  [r9], %5           \n"
    "vst1.32     {d23[0]},  [r9], %5           \n"
    "vst1.32     {d23[1]},  [r9]               \n"

    "add         %0, #4*2                      \n"  // src   += 4 * 2
    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %6,  #4                       \n"  // w     -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
    "vld2.16     {d1[3], d3[3]}, [r9]          \n"

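    // after the interleaved loads, vtrn.8 leaves U columns 0/1 in d0/d2 and
    // V columns 0/1 in d1/d3.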
    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d2}, [r9]                    \n"

    "mov         r9, %4                        \n"

    "vst1.64     {d1}, [r9], %5                \n"
    "vst1.64     {d3}, [r9]                    \n"
    "add         %0, #2*2                      \n"  // src   += 2 * 2
    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %6,  #2                       \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
    "vld2.8      {d0[7], d1[7]}, [%0]          \n"

    "vst1.64     {d0}, [%2]                    \n"
    "vst1.64     {d1}, [%4]                    \n"

    "4:                                        \n"
    : "+r"(src),                 // %0
      "+r"(src_stride),          // %1
      "+r"(dst_a),               // %2
      "+r"(dst_stride_a),        // %3
      "+r"(dst_b),               // %4
      "+r"(dst_stride_b),        // %5
      "+r"(width)                // %6
    : "r"(&kVTbl4x4TransposeDi)  // %7
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif