/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8     {q0, q1}, [%0]!                \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1"              // Clobber List
  );
}
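
// For reference, the kernel above is equivalent to this scalar sketch
// (illustrative only, not compiled; names are not part of libyuv - the
// project's real C fallbacks live in scale_common.cc):
#if 0
static void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair
  }
}
#endif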

// Read 32x1, average down, and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1"     // Clobber List
  );
}

// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "q0", "q1", "q2", "q3"     // Clobber List
  );
}
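
// For reference, a scalar sketch of the 2x2 box filter above (illustrative
// only, not compiled): each output pixel is the rounded average of a 2x2
// block, matching the vrshrn #2 round-then-shift.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // rounded average
    s += 2;
    t += 2;
  }
}
#endif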

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
    "subs       %2, %2, #8                     \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {d2}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
    MEMACCESS(3)
    "vld1.8     {q1}, [%3]!                    \n"
    MEMACCESS(4)
    "vld1.8     {q2}, [%4]!                    \n"
    MEMACCESS(5)
    "vld1.8     {q3}, [%5]!                    \n"
    "subs       %2, %2, #4                     \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
    MEMACCESS(1)
    "vst1.32    {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(src_ptr1),  // %3
    "+r"(src_ptr2),  // %4
    "+r"(src_ptr3)   // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "subs       %2, %2, #24                  \n"
    "vmov       d2, d3                       \n" // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!          \n"
    "bgt        1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8     q8, d4                       \n"
    "vmovl.u8     q9, d5                       \n"
    "vmovl.u8     q10, d6                      \n"
    "vmovl.u8     q11, d7                      \n"

    // 3 * line_0 + line_1
    "vmlal.u8     q8, d0, d24                  \n"
    "vmlal.u8     q9, d1, d24                  \n"
    "vmlal.u8     q10, d2, d24                 \n"
    "vmlal.u8     q11, d3, d24                 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16  d0, q8, #2                   \n"
    "vqrshrn.u16  d1, q9, #2                   \n"
    "vqrshrn.u16  d2, q10, #2                  \n"
    "vqrshrn.u16  d3, q11, #2                  \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q8, d1                       \n"
    "vmlal.u8     q8, d0, d24                  \n"
    "vqrshrn.u16  d0, q8, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q8, d2                       \n"
    "vmlal.u8     q8, d3, d24                  \n"
    "vqrshrn.u16  d2, q8, #2                   \n"

    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"

    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
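
// For reference, a scalar sketch of the horizontal 4 -> 3 filtering used by
// the two Box kernels here (illustrative only, not compiled).  The vertical
// row blend is assumed to have been applied already, as in the kernels'
// first stage:
#if 0
static void ScaleRowDown34_Filter_Sketch(const uint8* src, uint8* dst,
                                         int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[x + 0] = (uint8)((src[0] * 3 + src[1] + 2) >> 2);  // a0: 3/4, 1/4
    dst[x + 1] = (uint8)((src[1] + src[2] + 1) >> 1);      // a1: 1/2, 1/2
    dst[x + 2] = (uint8)((src[2] + src[3] * 3 + 2) >> 2);  // a2: 1/4, 3/4
    src += 4;
  }
}
#endif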

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"
    // average src line 0 with src line 1
    "vrhadd.u8    q0, q0, q2                   \n"
    "vrhadd.u8    q1, q1, q3                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q3, d1                       \n"
    "vmlal.u8     q3, d0, d24                  \n"
    "vqrshrn.u16  d0, q3, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q3, d2                       \n"
    "vmlal.u8     q3, d3, d24                  \n"
    "vqrshrn.u16  d2, q3, #2                   \n"

    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
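
// These multipliers implement division by 6 and 9 in 16-bit fixed point:
// vqrdmulh.s16 computes roughly (a * b * 2 + 0x8000) >> 16, so multiplying
// by 65536 / 12 approximates a / 6, and 65536 / 18 approximates a / 9,
// because the instruction doubles the product.  For example, a box sum of
// 600 maps to (600 * 5461 * 2 + 32768) >> 16 = 100.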

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, %2, #12                    \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    MEMACCESS(1)
    "vst1.8     {d4}, [%1]!                    \n"
    MEMACCESS(1)
    "vst1.32    {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16    {q13}, [%5]                    \n"
    MEMACCESS(6)
    "vld1.8     {q14}, [%6]                    \n"
    MEMACCESS(7)
    "vld1.8     {q15}, [%7]                    \n"
    "add        %3, %0                         \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    MEMACCESS(4)
    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"
    "vtrn.u8      d16, d17                     \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"
    "vtrn.u8      d18, d19                     \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"
    "vpaddl.u8    q8, q8                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"
    "vpaddl.u8    d19, d19                     \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     q0, q8                       \n"
    "vadd.u16     d4, d3, d7                   \n"
    "vadd.u16     d4, d19                      \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16    d4, q2                       \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"
    "vmovl.u8     q9, d18                      \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"
    "vadd.u16     q1, q9                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride),       // %3
    "+r"(src_ptr1)          // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16    {q13}, [%4]                    \n"
    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     d4, d3, d7                   \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16  d4, q2, #2                   \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp = NULL;
  asm volatile (
  "1:                                          \n"
    "mov       %0, %1                          \n"
    "mov       r12, %5                         \n"
    "veor      q2, q2, q2                      \n"
    "veor      q3, q3, q3                      \n"
  "2:                                          \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8     {q0}, [%0], %3                 \n"
    "vaddw.u8   q3, q3, d1                     \n"
    "vaddw.u8   q2, q2, d0                     \n"
    "subs       r12, r12, #1                   \n"
    "bgt        2b                             \n"
    MEMACCESS(2)
    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
    "add        %1, %1, #16                    \n"
    "subs       %4, %4, #16                    \n"  // 16 processed per loop
    "bgt        1b                             \n"
  : "+r"(src_tmp),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_ptr),          // %2
    "+r"(src_stride),       // %3
    "+r"(src_width),        // %4
    "+r"(src_height)        // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
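
// For reference, a scalar sketch of the column-sum above (illustrative only,
// not compiled): each 16-bit output is the sum of src_height source rows.
#if 0
static void ScaleAddRows_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int src_width,
                                int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = sum;
  }
}
#endif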

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5                     \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q1, q1, q0                     \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32   q2, q1, q3                     \n"
    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov       q10, q1                        \n"
    "vmov       q11, q2                        \n"
    "vuzp.16    q10, q11                       \n"
    "vmovl.u8   q8, d6                         \n"
    "vmovl.u8   q9, d7                         \n"
    "vsubl.s16  q11, d18, d16                  \n"
    "vsubl.s16  q12, d19, d17                  \n"
    "vmovl.u16  q13, d20                       \n"
    "vmovl.u16  q10, d21                       \n"
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
    "vshrn.s32  d18, q11, #16                  \n"
    "vshrn.s32  d19, q12, #16                  \n"
    "vadd.s16   q8, q8, q9                     \n"
    "vmovn.s16  d6, q8                         \n"

    MEMACCESS(0)
    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
    "vadd.s32   q1, q1, q0                     \n"
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE
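
// For reference, a scalar sketch of the 16.16 fixed-point stepping above
// (illustrative only, not compiled): the integer part of x indexes the
// source and the fraction blends adjacent pixels; the NEON version does the
// same eight lanes at a time.
#if 0
static void ScaleFilterCols_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                  // integer source index
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
    x += dx;                           // advance in 16.16 fixed point
  }
}
#endif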

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp          %4, #0                       \n"
    "beq          100f                         \n"
    "add          %2, %1                       \n"
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"

    "vdup.8       d5, %4                       \n"
    "rsb          %4, #256                     \n"
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q1}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q0}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    "subs         %3, %3, #16                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          100b                         \n"

  "99:                                         \n"
    MEMACCESS(0)
    "vst1.8       {d1[7]}, [%0]                \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
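
// For reference, a scalar sketch of the general blend path above
// (illustrative only, not compiled); the 25/50/75/100 labels merely
// shortcut source_y_fraction values of 192, 128, 64 and 0.
#if 0
static void ScaleFilterRows_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;  // 0..256
  int y0 = 256 - y1;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate last pixel
}
#endif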

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32    {q0, q1}, [%0]!                \n"
    MEMACCESS(0)
    "vld2.32    {q2, q3}, [%0]!                \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    "vrshrn.u16 d2, q2, #1                     \n"
    "vrshrn.u16 d3, q3, #1                     \n"
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
    "bgt       1b                              \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d0[1]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
  : "r"(src_stepx)     // %3
  : "memory", "cc", "r12", "q0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8     {d1}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d2}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d3}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d4}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d5}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d6}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"
    "vaddl.u8   q1, d2, d3                     \n"
    "vaddl.u8   q2, d4, d5                     \n"
    "vaddl.u8   q3, d6, d7                     \n"
    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}
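
// For reference, a scalar sketch of the stepped 2x2 ARGB box average above
// (illustrative only, not compiled): each output pixel is the rounded
// per-channel average of a 2x2 block starting at x * src_stepx.
#if 0
static void ScaleARGBRowDownEvenBox_Sketch(const uint8* src_argb,
                                           ptrdiff_t src_stride,
                                           int src_stepx, uint8* dst_argb,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8* s = src_argb + x * src_stepx * 4;  // top-left of 2x2 block
    const uint8* t = s + src_stride;                // row below
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
      dst_argb[x * 4 + c] =
          (uint8)((s[c] + s[c + 4] + t[c] + t[c + 4] + 2) >> 2);
    }
  }
}
#endif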

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n)                               \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5, lsl #2             \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "vld1.32    {"#dn"["#n"]}, [%6]            \n"

void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  int tmp = 0;
  const uint8* src_tmp = src_argb;
  asm volatile (
  "1:                                          \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
    "lsr        %5, %3, #16                           \n"      \
    "add        %6, %1, %5, lsl #2                    \n"      \
    "add        %3, %3, %4                            \n"      \
    MEMACCESS(6)                                               \
    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q8, q1, q0                     \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32   d22, q8, #9                   \n"
    "vand.16     d22, d22, d30                 \n"
    "vdup.8      d24, d22[0]                   \n"
    "vdup.8      d25, d22[2]                   \n"
    "vdup.8      d26, d22[4]                   \n"
    "vdup.8      d27, d22[6]                   \n"
    "vext.8      d4, d24, d25, #4              \n"
    "vext.8      d5, d26, d27, #4              \n"  // f
    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
    "vmull.u8    q11, d0, d20                  \n"
    "vmull.u8    q12, d1, d21                  \n"
    "vmull.u8    q13, d2, d4                   \n"
    "vmull.u8    q14, d3, d5                   \n"
    "vadd.i16    q11, q11, q13                 \n"
    "vadd.i16    q12, q12, q14                 \n"
    "vshrn.i16   d0, q11, #7                   \n"
    "vshrn.i16   d1, q12, #7                   \n"

    MEMACCESS(0)
    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
    "vadd.s32    q8, q8, q9                    \n"
    "subs        %2, %2, #4                    \n"  // 4 processed per loop
    "bgt         1b                            \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif