/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8     {q0, q1}, [%0]!                \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"  // Clobber List
  );
}
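
// For reference, a scalar sketch of the same 2:1 point downscale (a hedged
// illustration, not the shipped C fallback): each output pixel keeps the odd
// source pixel of a pair, matching the NEON path above, which stores q1.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr,
                                   uint8* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep odd pixel, drop even
  }
}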

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"  // Clobber List
  );
}

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"  // Clobber List
  );
}
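
// A hedged scalar sketch of the 2x2 box filter above: average each 2x2
// block with rounding, which is what the vpaddl/vpadal/vrshrn #2 sequence
// computes 16 pixels at a time.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // round, pack
    s += 2;
    t += 2;
  }
}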

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
    "subs       %2, %2, #8                     \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {d2}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
    MEMACCESS(3)
    "vld1.8     {q1}, [%3]!                    \n"
    MEMACCESS(4)
    "vld1.8     {q2}, [%4]!                    \n"
    MEMACCESS(5)
    "vld1.8     {q3}, [%5]!                    \n"
    "subs       %2, %2, #4                     \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
    MEMACCESS(1)
    "vst1.32    {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(src_ptr1),  // %3
    "+r"(src_ptr2),  // %4
    "+r"(src_ptr3)   // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "subs       %2, %2, #24                  \n"
    "vmov       d2, d3                       \n" // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!          \n"
    "bgt        1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
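
// A hedged scalar equivalent of the 4:3 point sample above: of every 4
// source pixels, pixels 0, 1 and 3 survive, which is what the
// "vmov d2, d3" accomplishes after the vld4 de-interleave.
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}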

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8     q8, d4                       \n"
    "vmovl.u8     q9, d5                       \n"
    "vmovl.u8     q10, d6                      \n"
    "vmovl.u8     q11, d7                      \n"

    // 3 * line_0 + line_1
    "vmlal.u8     q8, d0, d24                  \n"
    "vmlal.u8     q9, d1, d24                  \n"
    "vmlal.u8     q10, d2, d24                 \n"
    "vmlal.u8     q11, d3, d24                 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16  d0, q8, #2                   \n"
    "vqrshrn.u16  d1, q9, #2                   \n"
    "vqrshrn.u16  d2, q10, #2                  \n"
    "vqrshrn.u16  d3, q11, #2                  \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q8, d1                       \n"
    "vmlal.u8     q8, d0, d24                  \n"
    "vqrshrn.u16  d0, q8, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q8, d2                       \n"
    "vmlal.u8     q8, d3, d24                  \n"
    "vqrshrn.u16  d2, q8, #2                   \n"

    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"

    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"
    // average src line 0 with src line 1
    "vrhadd.u8    q0, q0, q2                   \n"
    "vrhadd.u8    q1, q1, q3                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q3, d1                       \n"
    "vmlal.u8     q3, d0, d24                  \n"
    "vqrshrn.u16  d0, q3, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q3, d2                       \n"
    "vmlal.u8     q3, d3, d24                  \n"
    "vqrshrn.u16  d2, q3, #2                   \n"

    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
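
// These constants implement a fixed-point divide: vqrdmulh.s16 returns the
// high half of a rounding doubling multiply, roughly (a * b * 2) >> 16.
// With b = 65536 / 12 that yields a * 2 / 12 = a / 6, and with
// b = 65536 / 18 it yields a / 9 -- hence the Div6/Div9 names despite the
// /12 and /18 initializers. A hedged scalar sketch of the trick:
static __inline uint16 ScaleDiv9_Sketch(uint16 a) {
  // rounding doubling high-half multiply, as vqrdmulh does: ~a / 9
  return (uint16)(((uint32)(a) * (65536 / 18) * 2 + 32768) >> 16);
}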

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, %2, #12                    \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    MEMACCESS(1)
    "vst1.8     {d4}, [%1]!                    \n"
    MEMACCESS(1)
    "vst1.32    {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
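
// A hedged scalar view of the same point sample: the kShuf38 table simply
// gathers 12 of every 32 source pixels in a repeating 3, 3, 2 stride
// pattern (offsets 0, 3, 6, then +8 for each following group).
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}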

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16    {q13}, [%5]                    \n"
    MEMACCESS(6)
    "vld1.8     {q14}, [%6]                    \n"
    MEMACCESS(7)
    "vld1.8     {q15}, [%7]                    \n"
    "add        %3, %0                         \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    MEMACCESS(4)
    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align it
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"
    "vtrn.u8      d16, d17                     \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"
    "vtrn.u8      d18, d19                     \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"
    "vpaddl.u8    q8, q8                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"
    "vpaddl.u8    d19, d19                     \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     q0, q8                       \n"
    "vadd.u16     d4, d3, d7                   \n"
    "vadd.u16     d4, d19                      \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16    d4, q2                       \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"
    "vmovl.u8     q9, d18                      \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"
    "vadd.u16     q1, q9                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride),       // %3
    "+r"(src_ptr1)          // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16    {q13}, [%4]                    \n"
    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align it
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     d4, d3, d7                   \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16  d4, q2, #2                   \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp;
  asm volatile (
  "1:                                          \n"
    "mov       %0, %1                          \n"
    "mov       r12, %5                         \n"
    "veor      q2, q2, q2                      \n"
    "veor      q3, q3, q3                      \n"
  "2:                                          \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8     {q0}, [%0], %3                 \n"
    "vaddw.u8   q3, q3, d1                     \n"
    "vaddw.u8   q2, q2, d0                     \n"
    "subs       r12, r12, #1                   \n"
    "bgt        2b                             \n"
    MEMACCESS(2)
    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
    "add        %1, %1, #16                    \n"
    "subs       %4, %4, #16                    \n"  // 16 processed per loop
    "bgt        1b                             \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
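
// A hedged scalar sketch of ScaleAddRows: accumulate a column of src_height
// bytes into one 16-bit sum per x, as the q2/q3 vaddw accumulators above do
// 16 columns at a time.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    uint16 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum = (uint16)(sum + *s);
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}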

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5                     \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
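//
// Written out as a scalar helper (a hedged sketch; the macro above, from
// row_common.cc, is the authoritative form), where f is the low 16 bits of
// the 16.16 fixed-point x position:
static __inline uint8 Blender_Sketch(uint8 a, uint8 b, int f) {
  return (uint8)((int)(a) + ((f * ((int)(b) - (int)(a)) + 0x8000) >> 16));
}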

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q1, q1, q0                     \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32   q2, q1, q3                     \n"
    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov       q10, q1                        \n"
    "vmov       q11, q2                        \n"
    "vuzp.16    q10, q11                       \n"
    "vmovl.u8   q8, d6                         \n"
    "vmovl.u8   q9, d7                         \n"
    "vsubl.s16  q11, d18, d16                  \n"
    "vsubl.s16  q12, d19, d17                  \n"
    "vmovl.u16  q13, d20                       \n"
    "vmovl.u16  q10, d21                       \n"
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
    "vrshrn.s32  d18, q11, #16                 \n"
    "vrshrn.s32  d19, q12, #16                 \n"
    "vadd.s16   q8, q8, q9                     \n"
    "vmovn.s16  d6, q8                         \n"

    MEMACCESS(0)
    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
    "vadd.s32   q1, q1, q0                     \n"
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp          %4, #0                       \n"
    "beq          100f                         \n"
    "add          %2, %1                       \n"
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"

    "vdup.8       d5, %4                       \n"
    "rsb          %4, #256                     \n"
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "vld1.8       {q1}, [%1]!                  \n"
    MEMACCESS(2)
    "vld1.8       {q0}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    "subs         %3, %3, #16                  \n"
    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          100b                         \n"

  "99:                                         \n"
    MEMACCESS(0)
    "vst1.8       {d1[7]}, [%0]                \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
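
// A hedged scalar form of the general blend path above: source_y_fraction
// is in 1/256 units, so each output byte is
// (row0 * (256 - f) + row1 * f + 128) >> 8, matching the vrshrn.u16 #8.
// (The special 0/64/128/192 cases and the final vst1.8 {d1[7]} pixel
// duplication are omitted here.)
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     int dst_width, int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1 + 128) >> 8);
  }
}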

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32    {q0, q1}, [%0]!                \n"
    MEMACCESS(0)
    "vld2.32    {q2, q3}, [%0]!                \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    "vrshrn.u16 d2, q2, #1                     \n"
    "vrshrn.u16 d3, q3, #1                     \n"
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
    "bgt       1b                              \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d0[1]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
  : "r"(src_stepx)     // %3
  : "memory", "cc", "r12", "q0"
  );
}
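
// A hedged scalar sketch of the even-pixel ARGB downscale above: copy every
// src_stepx-th 32-bit pixel, as the four lane loads with the r12
// post-increment do.
static void ScaleARGBRowDownEven_C_Sketch(const uint8* src_argb,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(const void*)(src_argb);
  uint32* dst = (uint32*)(void*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}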

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8     {d1}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d2}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d3}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d4}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d5}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d6}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"
    "vaddl.u8   q1, d2, d3                     \n"
    "vaddl.u8   q2, d4, d5                     \n"
    "vaddl.u8   q3, d6, d7                     \n"
    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n)                               \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5, lsl #2             \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "vld1.32    {"#dn"["#n"]}, [%6]            \n"

void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  int tmp;
  const uint8* src_tmp = src_argb;
  asm volatile (
  "1:                                          \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "=&r"(tmp),       // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
    "lsr        %5, %3, #16                           \n"      \
    "add        %6, %1, %5, lsl #2                    \n"      \
    "add        %3, %3, %4                            \n"      \
    MEMACCESS(6)                                               \
    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q8, q1, q0                     \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32   d22, q8, #9                   \n"
    "vand.16     d22, d22, d30                 \n"
    "vdup.8      d24, d22[0]                   \n"
    "vdup.8      d25, d22[2]                   \n"
    "vdup.8      d26, d22[4]                   \n"
    "vdup.8      d27, d22[6]                   \n"
    "vext.8      d4, d24, d25, #4              \n"
    "vext.8      d5, d26, d27, #4              \n"  // f
    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
    "vmull.u8    q11, d0, d20                  \n"
    "vmull.u8    q12, d1, d21                  \n"
    "vmull.u8    q13, d2, d4                   \n"
    "vmull.u8    q14, d3, d5                   \n"
    "vadd.i16    q11, q11, q13                 \n"
    "vadd.i16    q12, q12, q14                 \n"
    "vshrn.i16   d0, q11, #7                   \n"
    "vshrn.i16   d1, q12, #7                   \n"

    MEMACCESS(0)
    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
    "vadd.s32    q8, q8, q9                    \n"
    "subs        %2, %2, #4                    \n"  // 4 processed per loop
    "bgt         1b                            \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x),                // %3
    "+r"(dx),               // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif