/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Byte-shuffle table for the 4x8 residual path in TransposeWx8_NEON:
// tbl indices that gather a 4x4 byte block column-major out of a
// 16-byte vector (bytes 0,4,8,12 form the first output column, etc.).
static uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
28
                       uint8* dst, int dst_stride, int width) {
29
  const uint8* src_temp;
30
  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
31 32 33 34
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
35
    "sub         %3, %3, #8                      \n"
36 37

    // handle 8x8 blocks. this should be the majority of the plane
38 39
    "1:                                          \n"
      "mov         %0, %1                        \n"
40 41

      MEMACCESS(0)
42
      "ld1        {v0.8b}, [%0], %5              \n"
43
      MEMACCESS(0)
44
      "ld1        {v1.8b}, [%0], %5              \n"
45
      MEMACCESS(0)
46
      "ld1        {v2.8b}, [%0], %5              \n"
47
      MEMACCESS(0)
48
      "ld1        {v3.8b}, [%0], %5              \n"
49
      MEMACCESS(0)
50
      "ld1        {v4.8b}, [%0], %5              \n"
51
      MEMACCESS(0)
52
      "ld1        {v5.8b}, [%0], %5              \n"
53
      MEMACCESS(0)
54
      "ld1        {v6.8b}, [%0], %5              \n"
55
      MEMACCESS(0)
56
      "ld1        {v7.8b}, [%0]                  \n"
57

58 59 60 61 62 63 64 65
      "trn2     v16.8b, v0.8b, v1.8b             \n"
      "trn1     v17.8b, v0.8b, v1.8b             \n"
      "trn2     v18.8b, v2.8b, v3.8b             \n"
      "trn1     v19.8b, v2.8b, v3.8b             \n"
      "trn2     v20.8b, v4.8b, v5.8b             \n"
      "trn1     v21.8b, v4.8b, v5.8b             \n"
      "trn2     v22.8b, v6.8b, v7.8b             \n"
      "trn1     v23.8b, v6.8b, v7.8b             \n"
66

67 68 69 70 71 72 73 74
      "trn2     v3.4h, v17.4h, v19.4h            \n"
      "trn1     v1.4h, v17.4h, v19.4h            \n"
      "trn2     v2.4h, v16.4h, v18.4h            \n"
      "trn1     v0.4h, v16.4h, v18.4h            \n"
      "trn2     v7.4h, v21.4h, v23.4h            \n"
      "trn1     v5.4h, v21.4h, v23.4h            \n"
      "trn2     v6.4h, v20.4h, v22.4h            \n"
      "trn1     v4.4h, v20.4h, v22.4h            \n"
75

76 77 78 79 80 81 82 83
      "trn2     v21.2s, v1.2s, v5.2s             \n"
      "trn1     v17.2s, v1.2s, v5.2s             \n"
      "trn2     v20.2s, v0.2s, v4.2s             \n"
      "trn1     v16.2s, v0.2s, v4.2s             \n"
      "trn2     v23.2s, v3.2s, v7.2s             \n"
      "trn1     v19.2s, v3.2s, v7.2s             \n"
      "trn2     v22.2s, v2.2s, v6.2s             \n"
      "trn1     v18.2s, v2.2s, v6.2s             \n"
84

85
      "mov         %0, %2                        \n"
86 87

    MEMACCESS(0)
88
      "st1      {v17.8b}, [%0], %6               \n"
89
    MEMACCESS(0)
90
      "st1      {v16.8b}, [%0], %6               \n"
91
    MEMACCESS(0)
92
      "st1      {v19.8b}, [%0], %6               \n"
93
    MEMACCESS(0)
94
      "st1      {v18.8b}, [%0], %6               \n"
95
    MEMACCESS(0)
96
      "st1      {v21.8b}, [%0], %6               \n"
97
    MEMACCESS(0)
98
      "st1      {v20.8b}, [%0], %6               \n"
99
    MEMACCESS(0)
100
      "st1      {v23.8b}, [%0], %6               \n"
101
    MEMACCESS(0)
102
      "st1      {v22.8b}, [%0]                   \n"
103

104 105 106
      "add         %1, %1, #8                    \n"  // src += 8
      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
      "subs        %3, %3, #8                    \n"  // w   -= 8
107
      "b.ge        1b                            \n"
108 109 110

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
111
    "adds        %3, %3, #8                      \n"
112
    "b.eq        4f                              \n"
113 114

    // some residual, so between 1 and 7 lines left to transpose
115
    "cmp         %3, #2                          \n"
116
    "b.lt        3f                              \n"
117

118
    "cmp         %3, #4                          \n"
119
    "b.lt        2f                              \n"
120 121

    // 4x8 block
122
    "mov         %0, %1                          \n"
123
    MEMACCESS(0)
124
    "ld1     {v0.s}[0], [%0], %5                 \n"
125
    MEMACCESS(0)
126
    "ld1     {v0.s}[1], [%0], %5                 \n"
127
    MEMACCESS(0)
128
    "ld1     {v0.s}[2], [%0], %5                 \n"
129
    MEMACCESS(0)
130
    "ld1     {v0.s}[3], [%0], %5                 \n"
131
    MEMACCESS(0)
132
    "ld1     {v1.s}[0], [%0], %5                 \n"
133
    MEMACCESS(0)
134
    "ld1     {v1.s}[1], [%0], %5                 \n"
135
    MEMACCESS(0)
136
    "ld1     {v1.s}[2], [%0], %5                 \n"
137
    MEMACCESS(0)
138
    "ld1     {v1.s}[3], [%0]                     \n"
139

140
    "mov         %0, %2                          \n"
141

142 143
    MEMACCESS(4)
    "ld1      {v2.16b}, [%4]                     \n"
144

145 146
    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
147 148 149 150

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
151
    "st1 {v3.s}[0], [%0], %6                     \n"
152
    MEMACCESS(0)
153
    "st1 {v3.s}[1], [%0], %6                     \n"
154
    MEMACCESS(0)
155
    "st1 {v3.s}[2], [%0], %6                     \n"
156
    MEMACCESS(0)
157
    "st1 {v3.s}[3], [%0]                         \n"
158

159
    "add         %0, %2, #4                      \n"
160
    MEMACCESS(0)
161
    "st1 {v0.s}[0], [%0], %6                     \n"
162
    MEMACCESS(0)
163
    "st1 {v0.s}[1], [%0], %6                     \n"
164
    MEMACCESS(0)
165
    "st1 {v0.s}[2], [%0], %6                     \n"
166
    MEMACCESS(0)
167
    "st1 {v0.s}[3], [%0]                         \n"
168

169 170 171
    "add         %1, %1, #4                      \n"  // src += 4
    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
    "subs        %3, %3, #4                      \n"  // w   -= 4
172
    "b.eq        4f                              \n"
173 174 175

    // some residual, check to see if it includes a 2x8 block,
    // or less
176
    "cmp         %3, #2                          \n"
177
    "b.lt        3f                              \n"
178 179

    // 2x8 block
180 181
    "2:                                          \n"
    "mov         %0, %1                          \n"
182
    MEMACCESS(0)
183
    "ld1     {v0.h}[0], [%0], %5                 \n"
184
    MEMACCESS(0)
185
    "ld1     {v1.h}[0], [%0], %5                 \n"
186
    MEMACCESS(0)
187
    "ld1     {v0.h}[1], [%0], %5                 \n"
188
    MEMACCESS(0)
189
    "ld1     {v1.h}[1], [%0], %5                 \n"
190
    MEMACCESS(0)
191
    "ld1     {v0.h}[2], [%0], %5                 \n"
192
    MEMACCESS(0)
193
    "ld1     {v1.h}[2], [%0], %5                 \n"
194
    MEMACCESS(0)
195
    "ld1     {v0.h}[3], [%0], %5                 \n"
196
    MEMACCESS(0)
197
    "ld1     {v1.h}[3], [%0]                     \n"
198

199 200
    "trn2    v2.8b, v0.8b, v1.8b                 \n"
    "trn1    v3.8b, v0.8b, v1.8b                 \n"
201

202
    "mov         %0, %2                          \n"
203 204

    MEMACCESS(0)
205
    "st1     {v3.8b}, [%0], %6                   \n"
206
    MEMACCESS(0)
207
    "st1     {v2.8b}, [%0]                       \n"
208

209 210 211
    "add         %1, %1, #2                      \n"  // src += 2
    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
    "subs        %3, %3,  #2                     \n"  // w   -= 2
212
    "b.eq        4f                              \n"
213 214

    // 1x8 block
215
    "3:                                          \n"
216
    MEMACCESS(1)
217
    "ld1         {v0.b}[0], [%1], %5             \n"
218
    MEMACCESS(1)
219
    "ld1         {v0.b}[1], [%1], %5             \n"
220
    MEMACCESS(1)
221
    "ld1         {v0.b}[2], [%1], %5             \n"
222
    MEMACCESS(1)
223
    "ld1         {v0.b}[3], [%1], %5             \n"
224
    MEMACCESS(1)
225
    "ld1         {v0.b}[4], [%1], %5             \n"
226
    MEMACCESS(1)
227
    "ld1         {v0.b}[5], [%1], %5             \n"
228
    MEMACCESS(1)
229
    "ld1         {v0.b}[6], [%1], %5             \n"
230
    MEMACCESS(1)
231 232 233 234 235 236 237
    "ld1         {v0.b}[7], [%1]                 \n"

    MEMACCESS(2)
    "st1         {v0.8b}, [%2]                   \n"

    "4:                                          \n"

238
    : "=&r"(src_temp),                            // %0
239 240
      "+r"(src),                                  // %1
      "+r"(dst),                                  // %2
241
      "+r"(width64)                               // %3
242
    : "r"(&kVTbl4x4Transpose),                    // %4
243 244
      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
245 246
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
247 248 249
  );
}

250 251 252
static uint8 kVTbl4x4TransposeDi[32] =
  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
253 254 255 256 257

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
258
  const uint8* src_temp;
259
  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
260 261 262 263
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
264
    "sub       %4, %4, #8                      \n"
265 266 267

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
268
    "mov       %0, %1                          \n"
269

270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
    MEMACCESS(0)
    "ld1       {v0.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v1.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v2.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v3.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v4.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v5.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v6.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1       {v7.16b}, [%0]                  \n"
286

287 288 289 290 291 292 293 294
    "trn1      v16.16b, v0.16b, v1.16b         \n"
    "trn2      v17.16b, v0.16b, v1.16b         \n"
    "trn1      v18.16b, v2.16b, v3.16b         \n"
    "trn2      v19.16b, v2.16b, v3.16b         \n"
    "trn1      v20.16b, v4.16b, v5.16b         \n"
    "trn2      v21.16b, v4.16b, v5.16b         \n"
    "trn1      v22.16b, v6.16b, v7.16b         \n"
    "trn2      v23.16b, v6.16b, v7.16b         \n"
295

296 297 298 299 300 301 302 303
    "trn1      v0.8h, v16.8h, v18.8h           \n"
    "trn2      v1.8h, v16.8h, v18.8h           \n"
    "trn1      v2.8h, v20.8h, v22.8h           \n"
    "trn2      v3.8h, v20.8h, v22.8h           \n"
    "trn1      v4.8h, v17.8h, v19.8h           \n"
    "trn2      v5.8h, v17.8h, v19.8h           \n"
    "trn1      v6.8h, v21.8h, v23.8h           \n"
    "trn2      v7.8h, v21.8h, v23.8h           \n"
304

305 306 307 308 309 310 311 312
    "trn1      v16.4s, v0.4s, v2.4s            \n"
    "trn2      v17.4s, v0.4s, v2.4s            \n"
    "trn1      v18.4s, v1.4s, v3.4s            \n"
    "trn2      v19.4s, v1.4s, v3.4s            \n"
    "trn1      v20.4s, v4.4s, v6.4s            \n"
    "trn2      v21.4s, v4.4s, v6.4s            \n"
    "trn1      v22.4s, v5.4s, v7.4s            \n"
    "trn2      v23.4s, v5.4s, v7.4s            \n"
313

314
    "mov       %0, %2                          \n"
315 316

    MEMACCESS(0)
317
    "st1       {v16.d}[0], [%0], %6            \n"
318
    MEMACCESS(0)
319
    "st1       {v18.d}[0], [%0], %6            \n"
320
    MEMACCESS(0)
321
    "st1       {v17.d}[0], [%0], %6            \n"
322
    MEMACCESS(0)
323
    "st1       {v19.d}[0], [%0], %6            \n"
324
    MEMACCESS(0)
325
    "st1       {v16.d}[1], [%0], %6            \n"
326
    MEMACCESS(0)
327
    "st1       {v18.d}[1], [%0], %6            \n"
328
    MEMACCESS(0)
329
    "st1       {v17.d}[1], [%0], %6            \n"
330
    MEMACCESS(0)
331
    "st1       {v19.d}[1], [%0]                \n"
332

333
    "mov       %0, %3                          \n"
334 335

    MEMACCESS(0)
336
    "st1       {v20.d}[0], [%0], %7            \n"
337
    MEMACCESS(0)
338
    "st1       {v22.d}[0], [%0], %7            \n"
339
    MEMACCESS(0)
340
    "st1       {v21.d}[0], [%0], %7            \n"
341
    MEMACCESS(0)
342
    "st1       {v23.d}[0], [%0], %7            \n"
343
    MEMACCESS(0)
344
    "st1       {v20.d}[1], [%0], %7            \n"
345
    MEMACCESS(0)
346
    "st1       {v22.d}[1], [%0], %7            \n"
347
    MEMACCESS(0)
348
    "st1       {v21.d}[1], [%0], %7            \n"
349
    MEMACCESS(0)
350
    "st1       {v23.d}[1], [%0]                \n"
351

352 353 354 355
    "add       %1, %1, #16                     \n"  // src   += 8*2
    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
    "subs      %4, %4,  #8                     \n"  // w     -= 8
356
    "b.ge      1b                              \n"
357 358 359

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
360
    "adds      %4, %4, #8                      \n"
361
    "b.eq      4f                              \n"
362 363

    // some residual, so between 1 and 7 lines left to transpose
364
    "cmp       %4, #2                          \n"
365
    "b.lt      3f                              \n"
366

367
    "cmp       %4, #4                          \n"
368
    "b.lt      2f                              \n"
369 370 371

    // TODO(frkoenig): Clean this up
    // 4x8 block
372
    "mov       %0, %1                          \n"
373
    MEMACCESS(0)
374
    "ld1       {v0.8b}, [%0], %5               \n"
375
    MEMACCESS(0)
376
    "ld1       {v1.8b}, [%0], %5               \n"
377
    MEMACCESS(0)
378
    "ld1       {v2.8b}, [%0], %5               \n"
379
    MEMACCESS(0)
380
    "ld1       {v3.8b}, [%0], %5               \n"
381
    MEMACCESS(0)
382
    "ld1       {v4.8b}, [%0], %5               \n"
383
    MEMACCESS(0)
384
    "ld1       {v5.8b}, [%0], %5               \n"
385
    MEMACCESS(0)
386
    "ld1       {v6.8b}, [%0], %5               \n"
387
    MEMACCESS(0)
388
    "ld1       {v7.8b}, [%0]                   \n"
389 390

    MEMACCESS(8)
391 392
    "ld1       {v30.16b}, [%8], #16            \n"
    "ld1       {v31.16b}, [%8]                 \n"
393

394 395 396 397
    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
398

399
    "mov       %0, %2                          \n"
400 401

    MEMACCESS(0)
402
    "st1       {v16.s}[0],  [%0], %6           \n"
403
    MEMACCESS(0)
404
    "st1       {v16.s}[1],  [%0], %6           \n"
405
    MEMACCESS(0)
406
    "st1       {v16.s}[2],  [%0], %6           \n"
407
    MEMACCESS(0)
408
    "st1       {v16.s}[3],  [%0], %6           \n"
409

410
    "add       %0, %2, #4                      \n"
411
    MEMACCESS(0)
412
    "st1       {v18.s}[0], [%0], %6            \n"
413
    MEMACCESS(0)
414
    "st1       {v18.s}[1], [%0], %6            \n"
415
    MEMACCESS(0)
416
    "st1       {v18.s}[2], [%0], %6            \n"
417
    MEMACCESS(0)
418
    "st1       {v18.s}[3], [%0]                \n"
419

420
    "mov       %0, %3                          \n"
421 422

    MEMACCESS(0)
423
    "st1       {v17.s}[0], [%0], %7            \n"
424
    MEMACCESS(0)
425
    "st1       {v17.s}[1], [%0], %7            \n"
426
    MEMACCESS(0)
427
    "st1       {v17.s}[2], [%0], %7            \n"
428
    MEMACCESS(0)
429
    "st1       {v17.s}[3], [%0], %7            \n"
430

431
    "add       %0, %3, #4                      \n"
432
    MEMACCESS(0)
433
    "st1       {v19.s}[0],  [%0], %7           \n"
434
    MEMACCESS(0)
435
    "st1       {v19.s}[1],  [%0], %7           \n"
436
    MEMACCESS(0)
437
    "st1       {v19.s}[2],  [%0], %7           \n"
438
    MEMACCESS(0)
439
    "st1       {v19.s}[3],  [%0]               \n"
440

441 442 443 444
    "add       %1, %1, #8                      \n"  // src   += 4 * 2
    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
    "subs      %4,  %4,  #4                    \n"  // w     -= 4
445
    "b.eq      4f                              \n"
446 447 448

    // some residual, check to see if it includes a 2x8 block,
    // or less
449
    "cmp       %4, #2                          \n"
450
    "b.lt      3f                              \n"
451 452 453

    // 2x8 block
    "2:                                        \n"
454
    "mov       %0, %1                          \n"
455
    MEMACCESS(0)
456
    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
457
    MEMACCESS(0)
458
    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
459
    MEMACCESS(0)
460
    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
461
    MEMACCESS(0)
462
    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
463
    MEMACCESS(0)
464
    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
465
    MEMACCESS(0)
466
    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
467
    MEMACCESS(0)
468
    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
469
    MEMACCESS(0)
470
    "ld2       {v2.h, v3.h}[3], [%0]           \n"
471

472 473 474 475
    "trn1      v4.8b, v0.8b, v2.8b             \n"
    "trn2      v5.8b, v0.8b, v2.8b             \n"
    "trn1      v6.8b, v1.8b, v3.8b             \n"
    "trn2      v7.8b, v1.8b, v3.8b             \n"
476

477
    "mov       %0, %2                          \n"
478 479

    MEMACCESS(0)
480
    "st1       {v4.d}[0], [%0], %6             \n"
481
    MEMACCESS(0)
482
    "st1       {v6.d}[0], [%0]                 \n"
483

484
    "mov       %0, %3                          \n"
485 486

    MEMACCESS(0)
487
    "st1       {v5.d}[0], [%0], %7             \n"
488
    MEMACCESS(0)
489
    "st1       {v7.d}[0], [%0]                 \n"
490

491 492 493 494
    "add       %1, %1, #4                      \n"  // src   += 2 * 2
    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
    "subs      %4,  %4,  #2                    \n"  // w     -= 2
495
    "b.eq      4f                              \n"
496 497 498 499

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
500
    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
501
    MEMACCESS(1)
502
    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
503
    MEMACCESS(1)
504
    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
505
    MEMACCESS(1)
506
    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
507
    MEMACCESS(1)
508
    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
509
    MEMACCESS(1)
510
    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
511
    MEMACCESS(1)
512
    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
513
    MEMACCESS(1)
514
    "ld2       {v0.b, v1.b}[7], [%1]           \n"
515

516 517
    MEMACCESS(2)
    "st1       {v0.d}[0], [%2]                 \n"
518
    MEMACCESS(3)
519
    "st1       {v1.d}[0], [%3]                 \n"
520 521 522

    "4:                                        \n"

523
    : "=&r"(src_temp),                            // %0
524 525 526
      "+r"(src),                                  // %1
      "+r"(dst_a),                                // %2
      "+r"(dst_b),                                // %3
527
      "+r"(width64)                               // %4
528 529 530 531
    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
      "r"(&kVTbl4x4TransposeDi)                   // %8
532
    : "memory", "cc",
533 534 535
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
      "v30", "v31"
536 537
  );
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)


#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif