/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32-bit ARM NEON only (not AArch64).
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// vtbl byte-shuffle table that transposes a row-major 4x4 block of bytes
// (source byte i*4+j is moved to destination byte j*4+i). The table is
// only ever read (loaded via vld1 in TransposeWx8_NEON), so declare it
// const.
static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
                                        2, 6, 10, 14, 3, 7, 11, 15};
// Transposes an 8-row band of a plane. Reads 8 rows of `width` bytes from
// src (consecutive rows src_stride bytes apart) and writes the transposed
// data to dst: one 8-byte destination row per source column, destination
// rows dst_stride bytes apart. Processes 8 columns per main-loop
// iteration, with 4-, 2- and 1-column fallback paths for the residual.
void TransposeWx8_NEON(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;  // scratch row pointer used inside the asm (%0)
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
      "mov         %0, %1                      \n"

      // Load 8 rows of 8 bytes into d0..d7.
      MEMACCESS(0)
      "vld1.8      {d0}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d1}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d2}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d3}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d4}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d5}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d6}, [%0], %2              \n"
      MEMACCESS(0)
      "vld1.8      {d7}, [%0]                  \n"

      // In-register 8x8 byte transpose: vtrn at 8-, 16- and 32-bit
      // granularity, then vrev16 swaps bytes within each 16-bit pair.
      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
      "vtrn.8      d5, d4                      \n"
      "vtrn.8      d7, d6                      \n"

      "vtrn.16     d1, d3                      \n"
      "vtrn.16     d0, d2                      \n"
      "vtrn.16     d5, d7                      \n"
      "vtrn.16     d4, d6                      \n"

      "vtrn.32     d1, d5                      \n"
      "vtrn.32     d0, d4                      \n"
      "vtrn.32     d3, d7                      \n"
      "vtrn.32     d2, d6                      \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

      // Store the 8 transposed rows. The register order (d1 before d0,
      // d3 before d2, ...) matches the ordering produced by the shuffle
      // sequence above.
      "mov         %0, %3                      \n"

    MEMACCESS(0)
      "vst1.8      {d1}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d0}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d3}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d2}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d5}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d4}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d7}, [%0], %4              \n"
    MEMACCESS(0)
      "vst1.8      {d6}, [%0]                  \n"

      "add         %1, #8                      \n"  // src += 8
      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
      "subs        %5,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %5, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    "cmp         %5, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.32     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[1]}, [%0]                 \n"

    "mov         %0, %3                        \n"

    // Load the 4x4 transpose shuffle table (kVTbl4x4Transpose) into q3
    // and permute with vtbl.
    MEMACCESS(6)
    "vld1.8      {q3}, [%6]                    \n"

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "vst1.32     {d4[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d4[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[1]}, [%0]                 \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d0[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d0[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[1]}, [%0]                 \n"

    "add         %1, #4                        \n"  // src += 4
    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %5,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.16     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[3]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[3]}, [%0]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d1}, [%0]                    \n"

    "add         %1, #2                        \n"  // src += 2
    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %5,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block: a single source column becomes one 8-byte dst row.
    "3:                                        \n"
    MEMACCESS(1)
    "vld1.8      {d0[0]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[1]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[2]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[3]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[4]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[5]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[6]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[7]}, [%1]                 \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"

    "4:                                        \n"

    : "=&r"(src_temp),         // %0
      "+r"(src),               // %1
      "+r"(src_stride),        // %2
      "+r"(dst),               // %3
      "+r"(dst_stride),        // %4
      "+r"(width)              // %5
    : "r"(&kVTbl4x4Transpose)  // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}

// vtbl byte-shuffle table used by TransposeUVWx8_NEON's 4x8 residual path:
// interleaves the low half of a q register with its high half
// (bytes 0,8,1,9,...,7,15). The table is only ever read (loaded via vld1),
// so declare it const.
static const uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
                                          4, 12, 5, 13, 6, 14, 7, 15};
void TransposeUVWx8_NEON(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
254
                         int width) {
255
  const uint8* src_temp;
256
  asm volatile (
257 258
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
259
    // at w-8 allow for this
260
    "sub         %7, #8                        \n"
261

262
    // handle 8x8 blocks. this should be the majority of the plane
263
    "1:                                        \n"
264
      "mov         %0, %1                      \n"
265

266
      MEMACCESS(0)
267
      "vld2.8      {d0,  d1},  [%0], %2        \n"
268
      MEMACCESS(0)
269
      "vld2.8      {d2,  d3},  [%0], %2        \n"
270
      MEMACCESS(0)
271
      "vld2.8      {d4,  d5},  [%0], %2        \n"
272
      MEMACCESS(0)
273
      "vld2.8      {d6,  d7},  [%0], %2        \n"
274
      MEMACCESS(0)
275
      "vld2.8      {d16, d17}, [%0], %2        \n"
276
      MEMACCESS(0)
277
      "vld2.8      {d18, d19}, [%0], %2        \n"
278
      MEMACCESS(0)
279
      "vld2.8      {d20, d21}, [%0], %2        \n"
280
      MEMACCESS(0)
281
      "vld2.8      {d22, d23}, [%0]            \n"
282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306

      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
      "vtrn.8      q9, q8                      \n"
      "vtrn.8      q11, q10                    \n"

      "vtrn.16     q1, q3                      \n"
      "vtrn.16     q0, q2                      \n"
      "vtrn.16     q9, q11                     \n"
      "vtrn.16     q8, q10                     \n"

      "vtrn.32     q1, q9                      \n"
      "vtrn.32     q0, q8                      \n"
      "vtrn.32     q3, q11                     \n"
      "vtrn.32     q2, q10                     \n"

      "vrev16.8    q0, q0                      \n"
      "vrev16.8    q1, q1                      \n"
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"
      "vrev16.8    q8, q8                      \n"
      "vrev16.8    q9, q9                      \n"
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

307 308
      "mov         %0, %3                      \n"

309
    MEMACCESS(0)
310
      "vst1.8      {d2},  [%0], %4             \n"
311
    MEMACCESS(0)
312
      "vst1.8      {d0},  [%0], %4             \n"
313
    MEMACCESS(0)
314
      "vst1.8      {d6},  [%0], %4             \n"
315
    MEMACCESS(0)
316
      "vst1.8      {d4},  [%0], %4             \n"
317
    MEMACCESS(0)
318
      "vst1.8      {d18}, [%0], %4             \n"
319
    MEMACCESS(0)
320
      "vst1.8      {d16}, [%0], %4             \n"
321
    MEMACCESS(0)
322
      "vst1.8      {d22}, [%0], %4             \n"
323
    MEMACCESS(0)
324 325 326 327
      "vst1.8      {d20}, [%0]                 \n"

      "mov         %0, %5                      \n"

328
    MEMACCESS(0)
329
      "vst1.8      {d3},  [%0], %6             \n"
330
    MEMACCESS(0)
331
      "vst1.8      {d1},  [%0], %6             \n"
332
    MEMACCESS(0)
333
      "vst1.8      {d7},  [%0], %6             \n"
334
    MEMACCESS(0)
335
      "vst1.8      {d5},  [%0], %6             \n"
336
    MEMACCESS(0)
337
      "vst1.8      {d19}, [%0], %6             \n"
338
    MEMACCESS(0)
339
      "vst1.8      {d17}, [%0], %6             \n"
340
    MEMACCESS(0)
341
      "vst1.8      {d23}, [%0], %6             \n"
342
    MEMACCESS(0)
343 344 345 346 347 348
      "vst1.8      {d21}, [%0]                 \n"

      "add         %1, #8*2                    \n"  // src   += 8*2
      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
      "subs        %7,  #8                     \n"  // w     -= 8
349
      "bge         1b                          \n"
350

351
    // add 8 back to counter. if the result is 0 there are
352
    // no residuals.
353
    "adds        %7, #8                        \n"
354
    "beq         4f                            \n"
355 356

    // some residual, so between 1 and 7 lines left to transpose
357
    "cmp         %7, #2                        \n"
358
    "blt         3f                            \n"
359

360
    "cmp         %7, #4                        \n"
361
    "blt         2f                            \n"
362

363
    // TODO(frkoenig): Clean this up
364
    // 4x8 block
365
    "mov         %0, %1                        \n"
366
    MEMACCESS(0)
367
    "vld1.64     {d0}, [%0], %2                \n"
368
    MEMACCESS(0)
369
    "vld1.64     {d1}, [%0], %2                \n"
370
    MEMACCESS(0)
371
    "vld1.64     {d2}, [%0], %2                \n"
372
    MEMACCESS(0)
373
    "vld1.64     {d3}, [%0], %2                \n"
374
    MEMACCESS(0)
375
    "vld1.64     {d4}, [%0], %2                \n"
376
    MEMACCESS(0)
377
    "vld1.64     {d5}, [%0], %2                \n"
378
    MEMACCESS(0)
379
    "vld1.64     {d6}, [%0], %2                \n"
380
    MEMACCESS(0)
381 382
    "vld1.64     {d7}, [%0]                    \n"

383
    MEMACCESS(8)
384
    "vld1.8      {q15}, [%8]                   \n"
385 386 387 388 389 390 391 392 393 394 395 396 397

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

398
    "mov         %0, %3                        \n"
399

400
    MEMACCESS(0)
401
    "vst1.32     {d16[0]},  [%0], %4           \n"
402
    MEMACCESS(0)
403
    "vst1.32     {d16[1]},  [%0], %4           \n"
404
    MEMACCESS(0)
405
    "vst1.32     {d17[0]},  [%0], %4           \n"
406
    MEMACCESS(0)
407
    "vst1.32     {d17[1]},  [%0], %4           \n"
408

409
    "add         %0, %3, #4                    \n"
410
    MEMACCESS(0)
411
    "vst1.32     {d20[0]}, [%0], %4            \n"
412
    MEMACCESS(0)
413
    "vst1.32     {d20[1]}, [%0], %4            \n"
414
    MEMACCESS(0)
415
    "vst1.32     {d21[0]}, [%0], %4            \n"
416
    MEMACCESS(0)
417
    "vst1.32     {d21[1]}, [%0]                \n"
418

419
    "mov         %0, %5                        \n"
420

421
    MEMACCESS(0)
422
    "vst1.32     {d18[0]}, [%0], %6            \n"
423
    MEMACCESS(0)
424
    "vst1.32     {d18[1]}, [%0], %6            \n"
425
    MEMACCESS(0)
426
    "vst1.32     {d19[0]}, [%0], %6            \n"
427
    MEMACCESS(0)
428
    "vst1.32     {d19[1]}, [%0], %6            \n"
429

430
    "add         %0, %5, #4                    \n"
431
    MEMACCESS(0)
432
    "vst1.32     {d22[0]},  [%0], %6           \n"
433
    MEMACCESS(0)
434
    "vst1.32     {d22[1]},  [%0], %6           \n"
435
    MEMACCESS(0)
436
    "vst1.32     {d23[0]},  [%0], %6           \n"
437
    MEMACCESS(0)
438
    "vst1.32     {d23[1]},  [%0]               \n"
439

440 441 442 443
    "add         %1, #4*2                      \n"  // src   += 4 * 2
    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %7,  #4                       \n"  // w     -= 4
444
    "beq         4f                            \n"
445 446 447

    // some residual, check to see if it includes a 2x8 block,
    // or less
448
    "cmp         %7, #2                        \n"
449
    "blt         3f                            \n"
450 451

    // 2x8 block
452
    "2:                                        \n"
453
    "mov         %0, %1                        \n"
454
    MEMACCESS(0)
455
    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
456
    MEMACCESS(0)
457
    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
458
    MEMACCESS(0)
459
    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
460
    MEMACCESS(0)
461
    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
462
    MEMACCESS(0)
463
    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
464
    MEMACCESS(0)
465
    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
466
    MEMACCESS(0)
467
    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
468
    MEMACCESS(0)
469
    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
470

471 472
    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"
473

474
    "mov         %0, %3                        \n"
475

476
    MEMACCESS(0)
477
    "vst1.64     {d0}, [%0], %4                \n"
478
    MEMACCESS(0)
479
    "vst1.64     {d2}, [%0]                    \n"
480

481
    "mov         %0, %5                        \n"
482

483
    MEMACCESS(0)
484
    "vst1.64     {d1}, [%0], %6                \n"
485
    MEMACCESS(0)
486
    "vst1.64     {d3}, [%0]                    \n"
487

488 489 490 491
    "add         %1, #2*2                      \n"  // src   += 2 * 2
    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %7,  #2                       \n"  // w     -= 2
492
    "beq         4f                            \n"
493 494

    // 1x8 block
495
    "3:                                        \n"
496
    MEMACCESS(1)
497
    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
498
    MEMACCESS(1)
499
    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
500
    MEMACCESS(1)
501
    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
502
    MEMACCESS(1)
503
    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
504
    MEMACCESS(1)
505
    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
506
    MEMACCESS(1)
507
    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
508
    MEMACCESS(1)
509
    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
510
    MEMACCESS(1)
511 512
    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

513
    MEMACCESS(3)
514
    "vst1.64     {d0}, [%3]                    \n"
515
    MEMACCESS(5)
516
    "vst1.64     {d1}, [%5]                    \n"
517 518

    "4:                                        \n"
519

520
    : "=&r"(src_temp),           // %0
521 522 523 524 525 526 527 528 529
      "+r"(src),                 // %1
      "+r"(src_stride),          // %2
      "+r"(dst_a),               // %3
      "+r"(dst_stride_a),        // %4
      "+r"(dst_b),               // %5
      "+r"(dst_stride_b),        // %6
      "+r"(width)                // %7
    : "r"(&kVTbl4x4TransposeDi)  // %8
    : "memory", "cc",
530 531 532
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif