/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 32 bit ARM NEON path only: excluded when NEON is disabled or on AArch64,
// which has its own implementation.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// VTBL byte-gather indices used by the 4x8 residual path of
// TransposeWx8_NEON: output byte i is taken from input byte
// kVTbl4x4Transpose[i], i.e. a byte-granularity transpose of the
// four 32-bit lanes held in {d0, d1} / {d2, d3}.
// const: the table is only ever read (loaded with vld1.8 into q3),
// so keep it in read-only storage.
static const uvec8 kVTbl4x4Transpose =
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };

// Transposes a (width x 8) tile of 8-bit pixels: 8 rows read from |src|
// (row pitch |src_stride|) are written out as 8-byte columns of |dst|
// (row pitch |dst_stride|).  The bulk of the tile is handled in 8x8
// blocks; residual columns are handled by dedicated 4x8, 2x8 and 1x8
// paths.  |src_temp| is a scratch row pointer owned by the asm (%0).
// MEMACCESS is a project macro from row.h annotating memory operands —
// presumably for sanitizer builds; confirm in libyuv/row.h.
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  const uint8* src_temp = NULL;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
      "mov         %0, %1                      \n"

      // Load 8 rows of 8 bytes each into d0..d7, stepping by src_stride.
      MEMACCESS(0)
      "vld1.8      {d0}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d1}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d2}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d3}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d4}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d5}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d6}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d7}, [%0]                  \n"

      // 8x8 transpose built from 8-, 16- and 32-bit vtrn stages.  The
      // register pairs are given to vtrn.8 in swapped order; the final
      // vrev16.8 pass (reverse bytes within each 16-bit lane) restores
      // byte order, and the stores below compensate by writing the
      // odd-numbered register of each pair first.
      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

      "mov         %0, %3                      \n"

    // Store the 8 transposed rows, stepping by dst_stride.
    MEMACCESS(0)
      "vst1.8      {d1}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d0}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d3}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d2}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d5}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d4}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d7}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d6}, [%0]                  \n"

      "add         %1, #8                      \n"  // src += 8
      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %5,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %5, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    "cmp         %5, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block: load 8 rows of 4 bytes as 32-bit lanes, then use the
    // kVTbl4x4Transpose byte-shuffle table (loaded into q3 below) to
    // transpose at byte granularity.
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.32     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[1]}, [%0]                 \n"

    "mov         %0, %3                        \n"

    MEMACCESS(6)
    "vld1.8      {q3}, [%6]                    \n"

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "vst1.32     {d4[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d4[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[1]}, [%0]                 \n"

    // Second half of each output row starts 4 bytes into dst.
    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d0[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d0[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[1]}, [%0]                 \n"

    "add         %1, #4                        \n"  // src += 4
    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %5,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.16     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[3]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[3]}, [%0]                 \n"

    // Interleave bytes of d0/d1 to finish the 2x8 transpose.
    "vtrn.8      d0, d1                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d1}, [%0]                    \n"

    "add         %1, #2                        \n"  // src += 2
    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %5,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block: gather one byte per row into d0, write one dst row.
    "3:                                        \n"
    MEMACCESS(1)
    "vld1.8      {d0[0]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[1]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[2]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[3]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[4]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[5]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[6]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[7]}, [%1]                 \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"

    "4:                                        \n"

    : "+r"(src_temp),          // %0
      "+r"(src),               // %1
      "+r"(src_stride),        // %2
      "+r"(dst),               // %3
      "+r"(dst_stride),        // %4
      "+r"(width)              // %5
    : "r"(&kVTbl4x4Transpose)  // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}
// VTBL byte-gather indices used by the 4x8 residual path of
// TransposeUVWx8_NEON: interleaves the low 8 bytes of a q register with
// its high 8 bytes (0,8, 1,9, ..., 7,15), i.e. a de-interleaving shuffle
// applied after the vtrn.8 stage.
// const: the table is only ever read (loaded with vld1.8 into q15),
// so keep it in read-only storage.
static const uvec8 kVTbl4x4TransposeDi =
  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
250
  const uint8* src_temp = NULL;
251
  asm volatile (
252 253
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
254
    // at w-8 allow for this
255
    "sub         %7, #8                        \n"
256

257
    // handle 8x8 blocks. this should be the majority of the plane
258
    "1:                                        \n"
259
      "mov         %0, %1                      \n"
260

261
      MEMACCESS(0)
262
      "vld2.8      {d0,  d1},  [%0], %2        \n"
263
      MEMACCESS(0)
264
      "vld2.8      {d2,  d3},  [%0], %2        \n"
265
      MEMACCESS(0)
266
      "vld2.8      {d4,  d5},  [%0], %2        \n"
267
      MEMACCESS(0)
268
      "vld2.8      {d6,  d7},  [%0], %2        \n"
269
      MEMACCESS(0)
270
      "vld2.8      {d16, d17}, [%0], %2        \n"
271
      MEMACCESS(0)
272
      "vld2.8      {d18, d19}, [%0], %2        \n"
273
      MEMACCESS(0)
274
      "vld2.8      {d20, d21}, [%0], %2        \n"
275
      MEMACCESS(0)
276
      "vld2.8      {d22, d23}, [%0]            \n"
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301

      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

302 303
      "mov         %0, %3                      \n"

304
    MEMACCESS(0)
305
      "vst1.8      {d2},  [%0], %4             \n"
306
    MEMACCESS(0)
307
      "vst1.8      {d0},  [%0], %4             \n"
308
    MEMACCESS(0)
309
      "vst1.8      {d6},  [%0], %4             \n"
310
    MEMACCESS(0)
311
      "vst1.8      {d4},  [%0], %4             \n"
312
    MEMACCESS(0)
313
      "vst1.8      {d18}, [%0], %4             \n"
314
    MEMACCESS(0)
315
      "vst1.8      {d16}, [%0], %4             \n"
316
    MEMACCESS(0)
317
      "vst1.8      {d22}, [%0], %4             \n"
318
    MEMACCESS(0)
319 320 321 322
      "vst1.8      {d20}, [%0]                 \n"

      "mov         %0, %5                      \n"

323
    MEMACCESS(0)
324
      "vst1.8      {d3},  [%0], %6             \n"
325
    MEMACCESS(0)
326
      "vst1.8      {d1},  [%0], %6             \n"
327
    MEMACCESS(0)
328
      "vst1.8      {d7},  [%0], %6             \n"
329
    MEMACCESS(0)
330
      "vst1.8      {d5},  [%0], %6             \n"
331
    MEMACCESS(0)
332
      "vst1.8      {d19}, [%0], %6             \n"
333
    MEMACCESS(0)
334
      "vst1.8      {d17}, [%0], %6             \n"
335
    MEMACCESS(0)
336
      "vst1.8      {d23}, [%0], %6             \n"
337
    MEMACCESS(0)
338 339 340 341 342 343
      "vst1.8      {d21}, [%0]                 \n"

      "add         %1, #8*2                    \n"  // src   += 8*2
      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %7,  #8                     \n"  // w     -= 8
344
      "bge         1b                          \n"
345

346
    // add 8 back to counter. if the result is 0 there are
347
    // no residuals.
348
    "adds        %7, #8                        \n"
349
    "beq         4f                            \n"
350 351

    // some residual, so between 1 and 7 lines left to transpose
352
    "cmp         %7, #2                        \n"
353
    "blt         3f                            \n"
354

355
    "cmp         %7, #4                        \n"
356
    "blt         2f                            \n"
357

358
    // TODO(frkoenig): Clean this up
359
    // 4x8 block
360
    "mov         %0, %1                        \n"
361
    MEMACCESS(0)
362
    "vld1.64     {d0}, [%0], %2                \n"
363
    MEMACCESS(0)
364
    "vld1.64     {d1}, [%0], %2                \n"
365
    MEMACCESS(0)
366
    "vld1.64     {d2}, [%0], %2                \n"
367
    MEMACCESS(0)
368
    "vld1.64     {d3}, [%0], %2                \n"
369
    MEMACCESS(0)
370
    "vld1.64     {d4}, [%0], %2                \n"
371
    MEMACCESS(0)
372
    "vld1.64     {d5}, [%0], %2                \n"
373
    MEMACCESS(0)
374
    "vld1.64     {d6}, [%0], %2                \n"
375
    MEMACCESS(0)
376 377
    "vld1.64     {d7}, [%0]                    \n"

378
    MEMACCESS(8)
379
    "vld1.8      {q15}, [%8]                   \n"
380 381 382 383 384 385 386 387 388 389 390 391 392

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

393
    "mov         %0, %3                        \n"
394

395
    MEMACCESS(0)
396
    "vst1.32     {d16[0]},  [%0], %4           \n"
397
    MEMACCESS(0)
398
    "vst1.32     {d16[1]},  [%0], %4           \n"
399
    MEMACCESS(0)
400
    "vst1.32     {d17[0]},  [%0], %4           \n"
401
    MEMACCESS(0)
402
    "vst1.32     {d17[1]},  [%0], %4           \n"
403

404
    "add         %0, %3, #4                    \n"
405
    MEMACCESS(0)
406
    "vst1.32     {d20[0]}, [%0], %4            \n"
407
    MEMACCESS(0)
408
    "vst1.32     {d20[1]}, [%0], %4            \n"
409
    MEMACCESS(0)
410
    "vst1.32     {d21[0]}, [%0], %4            \n"
411
    MEMACCESS(0)
412
    "vst1.32     {d21[1]}, [%0]                \n"
413

414
    "mov         %0, %5                        \n"
415

416
    MEMACCESS(0)
417
    "vst1.32     {d18[0]}, [%0], %6            \n"
418
    MEMACCESS(0)
419
    "vst1.32     {d18[1]}, [%0], %6            \n"
420
    MEMACCESS(0)
421
    "vst1.32     {d19[0]}, [%0], %6            \n"
422
    MEMACCESS(0)
423
    "vst1.32     {d19[1]}, [%0], %6            \n"
424

425
    "add         %0, %5, #4                    \n"
426
    MEMACCESS(0)
427
    "vst1.32     {d22[0]},  [%0], %6           \n"
428
    MEMACCESS(0)
429
    "vst1.32     {d22[1]},  [%0], %6           \n"
430
    MEMACCESS(0)
431
    "vst1.32     {d23[0]},  [%0], %6           \n"
432
    MEMACCESS(0)
433
    "vst1.32     {d23[1]},  [%0]               \n"
434

435 436 437 438
    "add         %1, #4*2                      \n"  // src   += 4 * 2
    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %7,  #4                       \n"  // w     -= 4
439
    "beq         4f                            \n"
440 441 442

    // some residual, check to see if it includes a 2x8 block,
    // or less
443
    "cmp         %7, #2                        \n"
444
    "blt         3f                            \n"
445 446

    // 2x8 block
447
    "2:                                        \n"
448
    "mov         %0, %1                        \n"
449
    MEMACCESS(0)
450
    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
451
    MEMACCESS(0)
452
    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
453
    MEMACCESS(0)
454
    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
455
    MEMACCESS(0)
456
    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
457
    MEMACCESS(0)
458
    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
459
    MEMACCESS(0)
460
    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
461
    MEMACCESS(0)
462
    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
463
    MEMACCESS(0)
464
    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
465

466 467
    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"
468

469
    "mov         %0, %3                        \n"
470

471
    MEMACCESS(0)
472
    "vst1.64     {d0}, [%0], %4                \n"
473
    MEMACCESS(0)
474
    "vst1.64     {d2}, [%0]                    \n"
475

476
    "mov         %0, %5                        \n"
477

478
    MEMACCESS(0)
479
    "vst1.64     {d1}, [%0], %6                \n"
480
    MEMACCESS(0)
481
    "vst1.64     {d3}, [%0]                    \n"
482

483 484 485 486
    "add         %1, #2*2                      \n"  // src   += 2 * 2
    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %7,  #2                       \n"  // w     -= 2
487
    "beq         4f                            \n"
488 489

    // 1x8 block
490
    "3:                                        \n"
491
    MEMACCESS(1)
492
    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
493
    MEMACCESS(1)
494
    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
495
    MEMACCESS(1)
496
    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
497
    MEMACCESS(1)
498
    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
499
    MEMACCESS(1)
500
    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
501
    MEMACCESS(1)
502
    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
503
    MEMACCESS(1)
504
    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
505
    MEMACCESS(1)
506 507
    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

508
    MEMACCESS(3)
509
    "vst1.64     {d0}, [%3]                    \n"
510
    MEMACCESS(5)
511
    "vst1.64     {d1}, [%5]                    \n"
512 513

    "4:                                        \n"
514

515 516 517 518 519 520 521 522 523 524
    : "+r"(src_temp),            // %0
      "+r"(src),                 // %1
      "+r"(src_stride),          // %2
      "+r"(dst_a),               // %3
      "+r"(dst_stride_a),        // %4
      "+r"(dst_b),               // %5
      "+r"(dst_stride_b),        // %6
      "+r"(width)                // %7
    : "r"(&kVTbl4x4TransposeDi)  // %8
    : "memory", "cc",
525 526 527
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif