/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include "libyuv/row.h"
12

13 14
#include "libyuv/basic_types.h"

15
#ifdef __cplusplus
16
namespace libyuv {
17 18
extern "C" {
#endif
19

20 21
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
22

23
static uvec8 kVTbl4x4Transpose =
24 25 26 27 28
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
29
  const uint8* src_temp = NULL;
30
  asm volatile (
31 32
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
33
    // at w-8 allow for this
34
    "sub         %5, #8                        \n"
35

36
    // handle 8x8 blocks. this should be the majority of the plane
37
    ".p2align  2                               \n"
38
    "1:                                        \n"
39
      "mov         %0, %1                      \n"
40

41
      MEMACCESS(0)
42
      "vld1.8      {d0}, [%0], %2              \n"
43
      MEMACCESS(0)
44
      "vld1.8      {d1}, [%0], %2              \n"
45
      MEMACCESS(0)
46
      "vld1.8      {d2}, [%0], %2              \n"
47
      MEMACCESS(0)
48
      "vld1.8      {d3}, [%0], %2              \n"
49
      MEMACCESS(0)
50
      "vld1.8      {d4}, [%0], %2              \n"
51
      MEMACCESS(0)
52
      "vld1.8      {d5}, [%0], %2              \n"
53
      MEMACCESS(0)
54
      "vld1.8      {d6}, [%0], %2              \n"
55
      MEMACCESS(0)
56
      "vld1.8      {d7}, [%0]                  \n"
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77

      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

78
      "mov         %0, %3                      \n"
79

80
    MEMACCESS(0)
81
      "vst1.8      {d1}, [%0], %4              \n"
82
    MEMACCESS(0)
83
      "vst1.8      {d0}, [%0], %4              \n"
84
    MEMACCESS(0)
85
      "vst1.8      {d3}, [%0], %4              \n"
86
    MEMACCESS(0)
87
      "vst1.8      {d2}, [%0], %4              \n"
88
    MEMACCESS(0)
89
      "vst1.8      {d5}, [%0], %4              \n"
90
    MEMACCESS(0)
91
      "vst1.8      {d4}, [%0], %4              \n"
92
    MEMACCESS(0)
93
      "vst1.8      {d7}, [%0], %4              \n"
94
    MEMACCESS(0)
95
      "vst1.8      {d6}, [%0]                  \n"
96

97 98 99
      "add         %1, #8                      \n"  // src += 8
      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %5,  #8                     \n"  // w   -= 8
100
      "bge         1b                          \n"
101

102
    // add 8 back to counter. if the result is 0 there are
103
    // no residuals.
104
    "adds        %5, #8                        \n"
105
    "beq         4f                            \n"
106 107

    // some residual, so between 1 and 7 lines left to transpose
108
    "cmp         %5, #2                        \n"
109
    "blt         3f                            \n"
110

111
    "cmp         %5, #4                        \n"
112
    "blt         2f                            \n"
113 114

    // 4x8 block
115
    "mov         %0, %1                        \n"
116
    MEMACCESS(0)
117
    "vld1.32     {d0[0]}, [%0], %2             \n"
118
    MEMACCESS(0)
119
    "vld1.32     {d0[1]}, [%0], %2             \n"
120
    MEMACCESS(0)
121
    "vld1.32     {d1[0]}, [%0], %2             \n"
122
    MEMACCESS(0)
123
    "vld1.32     {d1[1]}, [%0], %2             \n"
124
    MEMACCESS(0)
125
    "vld1.32     {d2[0]}, [%0], %2             \n"
126
    MEMACCESS(0)
127
    "vld1.32     {d2[1]}, [%0], %2             \n"
128
    MEMACCESS(0)
129
    "vld1.32     {d3[0]}, [%0], %2             \n"
130
    MEMACCESS(0)
131
    "vld1.32     {d3[1]}, [%0]                 \n"
132

133
    "mov         %0, %3                        \n"
134

135
    MEMACCESS(6)
136
    "vld1.8      {q3}, [%6]                    \n"
137

138 139 140 141
    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"
142

143 144
    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
145
    MEMACCESS(0)
146
    "vst1.32     {d4[0]}, [%0], %4             \n"
147
    MEMACCESS(0)
148
    "vst1.32     {d4[1]}, [%0], %4             \n"
149
    MEMACCESS(0)
150
    "vst1.32     {d5[0]}, [%0], %4             \n"
151
    MEMACCESS(0)
152 153 154
    "vst1.32     {d5[1]}, [%0]                 \n"

    "add         %0, %3, #4                    \n"
155
    MEMACCESS(0)
156
    "vst1.32     {d0[0]}, [%0], %4             \n"
157
    MEMACCESS(0)
158
    "vst1.32     {d0[1]}, [%0], %4             \n"
159
    MEMACCESS(0)
160
    "vst1.32     {d1[0]}, [%0], %4             \n"
161
    MEMACCESS(0)
162 163 164 165 166
    "vst1.32     {d1[1]}, [%0]                 \n"

    "add         %1, #4                        \n"  // src += 4
    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %5,  #4                       \n"  // w   -= 4
167
    "beq         4f                            \n"
168 169 170

    // some residual, check to see if it includes a 2x8 block,
    // or less
171
    "cmp         %5, #2                        \n"
172
    "blt         3f                            \n"
173 174

    // 2x8 block
175
    "2:                                        \n"
176
    "mov         %0, %1                        \n"
177
    MEMACCESS(0)
178
    "vld1.16     {d0[0]}, [%0], %2             \n"
179
    MEMACCESS(0)
180
    "vld1.16     {d1[0]}, [%0], %2             \n"
181
    MEMACCESS(0)
182
    "vld1.16     {d0[1]}, [%0], %2             \n"
183
    MEMACCESS(0)
184
    "vld1.16     {d1[1]}, [%0], %2             \n"
185
    MEMACCESS(0)
186
    "vld1.16     {d0[2]}, [%0], %2             \n"
187
    MEMACCESS(0)
188
    "vld1.16     {d1[2]}, [%0], %2             \n"
189
    MEMACCESS(0)
190
    "vld1.16     {d0[3]}, [%0], %2             \n"
191
    MEMACCESS(0)
192
    "vld1.16     {d1[3]}, [%0]                 \n"
193

194
    "vtrn.8      d0, d1                        \n"
195

196
    "mov         %0, %3                        \n"
197

198
    MEMACCESS(0)
199
    "vst1.64     {d0}, [%0], %4                \n"
200
    MEMACCESS(0)
201
    "vst1.64     {d1}, [%0]                    \n"
202

203 204 205
    "add         %1, #2                        \n"  // src += 2
    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %5,  #2                       \n"  // w   -= 2
206
    "beq         4f                            \n"
207 208

    // 1x8 block
209
    "3:                                        \n"
210
    MEMACCESS(1)
211
    "vld1.8      {d0[0]}, [%1], %2             \n"
212
    MEMACCESS(1)
213
    "vld1.8      {d0[1]}, [%1], %2             \n"
214
    MEMACCESS(1)
215
    "vld1.8      {d0[2]}, [%1], %2             \n"
216
    MEMACCESS(1)
217
    "vld1.8      {d0[3]}, [%1], %2             \n"
218
    MEMACCESS(1)
219
    "vld1.8      {d0[4]}, [%1], %2             \n"
220
    MEMACCESS(1)
221
    "vld1.8      {d0[5]}, [%1], %2             \n"
222
    MEMACCESS(1)
223
    "vld1.8      {d0[6]}, [%1], %2             \n"
224
    MEMACCESS(1)
225
    "vld1.8      {d0[7]}, [%1]                 \n"
226

227
    MEMACCESS(3)
228
    "vst1.64     {d0}, [%3]                    \n"
229

230
    "4:                                        \n"
231

232 233 234 235 236 237 238 239
    : "+r"(src_temp),          // %0
      "+r"(src),               // %1
      "+r"(src_stride),        // %2
      "+r"(dst),               // %3
      "+r"(dst_stride),        // %4
      "+r"(width)              // %5
    : "r"(&kVTbl4x4Transpose)  // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
240 241 242
  );
}

243
static uvec8 kVTbl4x4TransposeDi =
244 245 246 247 248 249
  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
250
  const uint8* src_temp = NULL;
251
  asm volatile (
252 253
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
254
    // at w-8 allow for this
255
    "sub         %7, #8                        \n"
256

257
    // handle 8x8 blocks. this should be the majority of the plane
258
    ".p2align  2                               \n"
259
    "1:                                        \n"
260
      "mov         %0, %1                      \n"
261

262
      MEMACCESS(0)
263
      "vld2.8      {d0,  d1},  [%0], %2        \n"
264
      MEMACCESS(0)
265
      "vld2.8      {d2,  d3},  [%0], %2        \n"
266
      MEMACCESS(0)
267
      "vld2.8      {d4,  d5},  [%0], %2        \n"
268
      MEMACCESS(0)
269
      "vld2.8      {d6,  d7},  [%0], %2        \n"
270
      MEMACCESS(0)
271
      "vld2.8      {d16, d17}, [%0], %2        \n"
272
      MEMACCESS(0)
273
      "vld2.8      {d18, d19}, [%0], %2        \n"
274
      MEMACCESS(0)
275
      "vld2.8      {d20, d21}, [%0], %2        \n"
276
      MEMACCESS(0)
277
      "vld2.8      {d22, d23}, [%0]            \n"
278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302

      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

303 304
      "mov         %0, %3                      \n"

305
    MEMACCESS(0)
306
      "vst1.8      {d2},  [%0], %4             \n"
307
    MEMACCESS(0)
308
      "vst1.8      {d0},  [%0], %4             \n"
309
    MEMACCESS(0)
310
      "vst1.8      {d6},  [%0], %4             \n"
311
    MEMACCESS(0)
312
      "vst1.8      {d4},  [%0], %4             \n"
313
    MEMACCESS(0)
314
      "vst1.8      {d18}, [%0], %4             \n"
315
    MEMACCESS(0)
316
      "vst1.8      {d16}, [%0], %4             \n"
317
    MEMACCESS(0)
318
      "vst1.8      {d22}, [%0], %4             \n"
319
    MEMACCESS(0)
320 321 322 323
      "vst1.8      {d20}, [%0]                 \n"

      "mov         %0, %5                      \n"

324
    MEMACCESS(0)
325
      "vst1.8      {d3},  [%0], %6             \n"
326
    MEMACCESS(0)
327
      "vst1.8      {d1},  [%0], %6             \n"
328
    MEMACCESS(0)
329
      "vst1.8      {d7},  [%0], %6             \n"
330
    MEMACCESS(0)
331
      "vst1.8      {d5},  [%0], %6             \n"
332
    MEMACCESS(0)
333
      "vst1.8      {d19}, [%0], %6             \n"
334
    MEMACCESS(0)
335
      "vst1.8      {d17}, [%0], %6             \n"
336
    MEMACCESS(0)
337
      "vst1.8      {d23}, [%0], %6             \n"
338
    MEMACCESS(0)
339 340 341 342 343 344
      "vst1.8      {d21}, [%0]                 \n"

      "add         %1, #8*2                    \n"  // src   += 8*2
      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %7,  #8                     \n"  // w     -= 8
345
      "bge         1b                          \n"
346

347
    // add 8 back to counter. if the result is 0 there are
348
    // no residuals.
349
    "adds        %7, #8                        \n"
350
    "beq         4f                            \n"
351 352

    // some residual, so between 1 and 7 lines left to transpose
353
    "cmp         %7, #2                        \n"
354
    "blt         3f                            \n"
355

356
    "cmp         %7, #4                        \n"
357
    "blt         2f                            \n"
358

359
    // TODO(frkoenig): Clean this up
360
    // 4x8 block
361
    "mov         %0, %1                        \n"
362
    MEMACCESS(0)
363
    "vld1.64     {d0}, [%0], %2                \n"
364
    MEMACCESS(0)
365
    "vld1.64     {d1}, [%0], %2                \n"
366
    MEMACCESS(0)
367
    "vld1.64     {d2}, [%0], %2                \n"
368
    MEMACCESS(0)
369
    "vld1.64     {d3}, [%0], %2                \n"
370
    MEMACCESS(0)
371
    "vld1.64     {d4}, [%0], %2                \n"
372
    MEMACCESS(0)
373
    "vld1.64     {d5}, [%0], %2                \n"
374
    MEMACCESS(0)
375
    "vld1.64     {d6}, [%0], %2                \n"
376
    MEMACCESS(0)
377 378
    "vld1.64     {d7}, [%0]                    \n"

379
    MEMACCESS(8)
380
    "vld1.8      {q15}, [%8]                   \n"
381 382 383 384 385 386 387 388 389 390 391 392 393

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

394
    "mov         %0, %3                        \n"
395

396
    MEMACCESS(0)
397
    "vst1.32     {d16[0]},  [%0], %4           \n"
398
    MEMACCESS(0)
399
    "vst1.32     {d16[1]},  [%0], %4           \n"
400
    MEMACCESS(0)
401
    "vst1.32     {d17[0]},  [%0], %4           \n"
402
    MEMACCESS(0)
403
    "vst1.32     {d17[1]},  [%0], %4           \n"
404

405
    "add         %0, %3, #4                    \n"
406
    MEMACCESS(0)
407
    "vst1.32     {d20[0]}, [%0], %4            \n"
408
    MEMACCESS(0)
409
    "vst1.32     {d20[1]}, [%0], %4            \n"
410
    MEMACCESS(0)
411
    "vst1.32     {d21[0]}, [%0], %4            \n"
412
    MEMACCESS(0)
413
    "vst1.32     {d21[1]}, [%0]                \n"
414

415
    "mov         %0, %5                        \n"
416

417
    MEMACCESS(0)
418
    "vst1.32     {d18[0]}, [%0], %6            \n"
419
    MEMACCESS(0)
420
    "vst1.32     {d18[1]}, [%0], %6            \n"
421
    MEMACCESS(0)
422
    "vst1.32     {d19[0]}, [%0], %6            \n"
423
    MEMACCESS(0)
424
    "vst1.32     {d19[1]}, [%0], %6            \n"
425

426
    "add         %0, %5, #4                    \n"
427
    MEMACCESS(0)
428
    "vst1.32     {d22[0]},  [%0], %6           \n"
429
    MEMACCESS(0)
430
    "vst1.32     {d22[1]},  [%0], %6           \n"
431
    MEMACCESS(0)
432
    "vst1.32     {d23[0]},  [%0], %6           \n"
433
    MEMACCESS(0)
434
    "vst1.32     {d23[1]},  [%0]               \n"
435

436 437 438 439
    "add         %1, #4*2                      \n"  // src   += 4 * 2
    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %7,  #4                       \n"  // w     -= 4
440
    "beq         4f                            \n"
441 442 443

    // some residual, check to see if it includes a 2x8 block,
    // or less
444
    "cmp         %7, #2                        \n"
445
    "blt         3f                            \n"
446 447

    // 2x8 block
448
    "2:                                        \n"
449
    "mov         %0, %1                        \n"
450
    MEMACCESS(0)
451
    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
452
    MEMACCESS(0)
453
    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
454
    MEMACCESS(0)
455
    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
456
    MEMACCESS(0)
457
    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
458
    MEMACCESS(0)
459
    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
460
    MEMACCESS(0)
461
    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
462
    MEMACCESS(0)
463
    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
464
    MEMACCESS(0)
465
    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
466

467 468
    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"
469

470
    "mov         %0, %3                        \n"
471

472
    MEMACCESS(0)
473
    "vst1.64     {d0}, [%0], %4                \n"
474
    MEMACCESS(0)
475
    "vst1.64     {d2}, [%0]                    \n"
476

477
    "mov         %0, %5                        \n"
478

479
    MEMACCESS(0)
480
    "vst1.64     {d1}, [%0], %6                \n"
481
    MEMACCESS(0)
482
    "vst1.64     {d3}, [%0]                    \n"
483

484 485 486 487
    "add         %1, #2*2                      \n"  // src   += 2 * 2
    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %7,  #2                       \n"  // w     -= 2
488
    "beq         4f                            \n"
489 490

    // 1x8 block
491
    "3:                                        \n"
492
    MEMACCESS(1)
493
    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
494
    MEMACCESS(1)
495
    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
496
    MEMACCESS(1)
497
    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
498
    MEMACCESS(1)
499
    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
500
    MEMACCESS(1)
501
    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
502
    MEMACCESS(1)
503
    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
504
    MEMACCESS(1)
505
    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
506
    MEMACCESS(1)
507 508
    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

509
    MEMACCESS(3)
510
    "vst1.64     {d0}, [%3]                    \n"
511
    MEMACCESS(5)
512
    "vst1.64     {d1}, [%5]                    \n"
513 514

    "4:                                        \n"
515

516 517 518 519 520 521 522 523 524 525
    : "+r"(src_temp),            // %0
      "+r"(src),                 // %1
      "+r"(src_stride),          // %2
      "+r"(dst_a),               // %3
      "+r"(dst_stride_a),        // %4
      "+r"(dst_b),               // %5
      "+r"(dst_stride_b),        // %6
      "+r"(width)                // %7
    : "r"(&kVTbl4x4TransposeDi)  // %8
    : "memory", "cc",
526 527 528
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif