/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include "libyuv/row.h"
12

13 14 15 16 17
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

18
// This module is for GCC Neon
19 20
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
21

22 23
// Read 8 Y, 4 U and 4 V from 422
// Operand %0 is the Y pointer, %1 the U pointer and %2 the V pointer; each is
// post-incremented by the number of bytes read.
// Result: d0 = 8 Y bytes, d2 = 4 U bytes (low half) followed by 4 V bytes.
#define READYUV422                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.32    {d2[1]}, [%2]!                 \n"
30

31 32
// Read 8 Y, 2 U and 2 V from 411
// Operand %0 is the Y pointer, %1 the U pointer and %2 the V pointer; each is
// post-incremented by the number of bytes read.
// The two U and two V bytes are each duplicated (vmov + vzip) so the result
// has the same layout as READYUV422:
// d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.  d3 is overwritten.
#define READYUV411                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vzip.u8    d2, d3                         \n"

// Read 8 Y, 8 U and 8 V from 444
// Operand %0 is the Y pointer, %1 the U pointer and %2 the V pointer; each is
// post-incremented by 8.
// Adjacent U (and V) bytes are summed pairwise and halved with rounding, so
// the result has the same layout as READYUV422:
// d0 = 8 Y bytes, d2 = 4 averaged U bytes followed by 4 averaged V bytes.
// d3 is overwritten.
#define READYUV444                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    MEMACCESS(2)                                                               \
    "vld1.8     {d3}, [%2]!                    \n"                             \
    "vpaddl.u8  q1, q1                         \n"                             \
    "vrshrn.u16 d2, q1, #1                     \n"

53 54
// Read 8 Y, and set 4 U and 4 V to 128
// Operand %0 is the Y pointer (post-incremented by 8).
// Result: d0 = 8 Y bytes, d2 = eight bytes of 128.
#define READYUV400                                                             \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vmov.u8    d2, #128                       \n"

59 60
// Read 8 Y and 4 UV from NV12
// Operand %0 is the Y pointer and %1 the interleaved UV pointer; both are
// post-incremented by 8.
// The interleaved pairs are de-interleaved (vuzp) and the halves recombined
// (vtrn) so the result has the same layout as READYUV422:
// d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.  d3 is overwritten.
#define READNV12                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
68 69 70

// Read 8 Y and 4 VU from NV21
// Operand %0 is the Y pointer and %1 the interleaved VU pointer; both are
// post-incremented by 8.
// Same as READNV12 except that the vuzp operands are swapped, so the odd
// bytes (U) land in d2 and the even bytes (V) in d3 before vtrn recombines
// them.  Result: d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.
// d3 is overwritten.
#define READNV21                                                               \
    MEMACCESS(0)                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    MEMACCESS(1)                                                               \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d3, d2                         \n"                             \
    "vtrn.u32   d2, d3                         \n"

// Read 8 YUY2
// Operand %0 is the packed YUY2 pointer (post-incremented by 16).
// vld2.8 de-interleaves the 16 bytes: even bytes (Y) go to d0, odd bytes
// (alternating U and V) go to d2.  The chroma bytes are then split and
// recombined as in READNV12.
// Result: d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.
// d3 is overwritten.
#define READYUY2                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d0, d2}, [%0]!                \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"

// Read 8 UYVY
// Operand %0 is the packed UYVY pointer (post-incremented by 16).
// vld2.8 de-interleaves the 16 bytes: even bytes (alternating U and V) go to
// d2, odd bytes (Y) go to d3.  Y is moved to d0, then the chroma bytes are
// split and recombined as in READNV12.
// Result: d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.
// d3 is overwritten.
#define READUYVY                                                               \
    MEMACCESS(0)                                                               \
    "vld2.8     {d2, d3}, [%0]!                \n"                             \
    "vmov.u8    d0, d3                         \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
95

96 97 98 99 100 101 102 103
// Load the conversion constants consumed by YUV422TORGB.  Expects the named
// asm operands kUVToRB, kUVToG, kUVBiasBGR and kYToRgb to hold pointers.
//   d24 <- 8 bytes at kUVToRB
//   d25 <- 8 bytes at kUVToG
//   q13 <- first 16-bit value at kUVBiasBGR, replicated to every lane
//   q4  <- second 16-bit value at kUVBiasBGR, replicated to every lane
//   q14 <- third 16-bit value at kUVBiasBGR, replicated to every lane
//   q15 <- first 32-bit value at kYToRgb, replicated to every lane
// NOTE(review): d8/d9 (q4) are written here, so an asm statement using this
// macro needs "q4" in its clobber list.
// NOTE(review): the %[kUVBiasBGR] register is post-incremented ("!") twice;
// confirm this is acceptable where it is passed as an input-only operand.
#define YUV422TORGB_SETUP_REG                                                  \
    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"

104
// Convert 8 pixels of YUV to 8-bit B, G and R.
// Inputs:  d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes (the layout
//          produced by the READ* macros), plus the constants loaded by
//          YUV422TORGB_SETUP_REG in d24, d25, q4, q13, q14 and q15.
// Outputs: d20 = B, d21 = G, d22 = R (8 bytes each).
// Scratch: q0, q1, q3, q8, q9 and q10 are overwritten.
// Y is widened to 32 bits, multiplied by q15 and narrowed back with a 16-bit
// shift.  Each 16-bit chroma product is copied into both halves of a 32-bit
// lane (vshll #16 followed by vaddw) so that one U/V sample is applied to two
// neighbouring pixels.  The sums use saturating arithmetic and are narrowed
// to unsigned bytes with a shift of 6.
#define YUV422TORGB                                                            \
    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
    "vmovl.u8   q0, d0                         \n" /* Y                      */\
    "vmovl.s16  q10, d1                        \n"                             \
    "vmovl.s16  q0, d0                         \n"                             \
    "vmul.s32   q10, q10, q15                  \n"                             \
    "vmul.s32   q0, q0, q15                    \n"                             \
    "vqshrun.s32 d0, q0, #16                   \n"                             \
    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
    "vadd.s16   d18, d19                       \n"                             \
    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
    "vaddw.u16  q1, q1, d16                    \n"                             \
    "vaddw.u16  q10, q10, d17                  \n"                             \
    "vaddw.u16  q3, q3, d18                    \n"                             \
    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
    "vqshrun.s16 d21, q0, #6                   \n" /* G */

// YUV to RGB conversion constants.
// The macros below are only used to build the static tables and are
// undefined again at the end of this section.
// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */

// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR            (VR * 128 - YGB)

// Byte multipliers applied to the 4 U bytes (first four entries) and the
// 4 V bytes (next four entries); only the first 8 bytes are loaded by
// YUV422TORGB_SETUP_REG.
static uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,
                          0, 0, 0, 0, 0, 0, 0, 0 };
static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
                        0, 0, 0, 0, 0, 0, 0, 0 };
// Biases in the order they are loaded by YUV422TORGB_SETUP_REG: B, G, R.
static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
// Y scale factor; only the first 32-bit entry is loaded.
static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };

#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
163

164 165 166 167 168 169
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
170
    YUV422TORGB_SETUP_REG
171
    ".p2align   2                              \n"
172 173 174 175 176
  "1:                                          \n"
    READYUV444
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
177
    MEMACCESS(3)
178 179 180 181 182 183 184
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
185 186 187 188
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
189 190 191 192 193
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
194 195 196 197
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
198
                        int width) {
199
  asm volatile (
200
    YUV422TORGB_SETUP_REG
201
    ".p2align   2                              \n"
202
  "1:                                          \n"
203
    READYUV422
204
    YUV422TORGB
205
    "subs       %4, %4, #8                     \n"
206
    "vmov.u8    d23, #255                      \n"
207
    MEMACCESS(3)
208
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
209
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
210 211 212 213 214
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
215 216 217 218
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
219 220
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
221 222 223
  );
}

224 225 226 227 228 229
void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
230
    YUV422TORGB_SETUP_REG
231
    ".p2align   2                              \n"
232 233 234 235 236
  "1:                                          \n"
    READYUV411
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
237
    MEMACCESS(3)
238 239 240 241 242 243 244
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
245 246 247 248
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
249 250 251 252 253
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
254 255 256 257
void I422ToBGRARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_bgra,
258
                        int width) {
259
  asm volatile (
260
    YUV422TORGB_SETUP_REG
261
    ".p2align   2                              \n"
262
  "1:                                          \n"
263
    READYUV422
264
    YUV422TORGB
265
    "subs       %4, %4, #8                     \n"
266 267
    "vswp.u8    d20, d22                       \n"
    "vmov.u8    d19, #255                      \n"
268
    MEMACCESS(3)
269
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
270
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
271 272 273 274 275
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_bgra),  // %3
      "+r"(width)      // %4
276 277 278 279
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
280 281
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
282 283 284
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
285 286 287 288
void I422ToABGRRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_abgr,
289
                        int width) {
290
  asm volatile (
291
    YUV422TORGB_SETUP_REG
292
    ".p2align   2                              \n"
293
  "1:                                          \n"
294
    READYUV422
295
    YUV422TORGB
296
    "subs       %4, %4, #8                     \n"
297 298
    "vswp.u8    d20, d22                       \n"
    "vmov.u8    d23, #255                      \n"
299
    MEMACCESS(3)
300
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
301
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
302 303 304 305 306
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_abgr),  // %3
      "+r"(width)      // %4
307 308 309 310
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
311 312
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
313 314
  );
}
315

fbarchard@google.com's avatar
fbarchard@google.com committed
316 317 318 319
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
320 321
                        int width) {
  asm volatile (
322
    YUV422TORGB_SETUP_REG
323
    ".p2align   2                              \n"
324
  "1:                                          \n"
325
    READYUV422
326
    YUV422TORGB
327
    "subs       %4, %4, #8                     \n"
328
    "vmov.u8    d19, #255                      \n"
329
    MEMACCESS(3)
330
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
331
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
332 333 334 335 336
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
337 338 339 340
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
341 342
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
343 344 345
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
346
void I422ToRGB24Row_NEON(const uint8* src_y,
347 348 349 350
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         int width) {
351
  asm volatile (
352
    YUV422TORGB_SETUP_REG
353
    ".p2align   2                              \n"
354 355 356 357
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
358
    MEMACCESS(3)
359 360
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
361 362 363 364 365
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
366 367 368 369
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
370 371 372 373 374
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
375 376 377 378
void I422ToRAWRow_NEON(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_raw,
379 380
                       int width) {
  asm volatile (
381
    YUV422TORGB_SETUP_REG
382
    ".p2align   2                              \n"
383 384 385 386 387
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"
388
    MEMACCESS(3)
389 390
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
391 392 393 394
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_raw),  // %3
395
      "+r"(width)     // %4
396 397 398 399
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
400 401 402 403 404
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

405 406 407 408 409 410 411 412 413 414 415 416
// Pack 8 pixels held as d20 = B, d21 = G, d22 = R (one byte per channel) into
// eight 16-bit RGB565 values in q0: B >> 3 in bits 0-4, G >> 2 in bits 5-10
// and R >> 3 in bits 11-15.
// Overwrites d20-d22 and q8-q10 in addition to q0.
#define ARGBTORGB565                                                           \
    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q0, q0, q10                    \n"  /* BGR                  */

417
void I422ToRGB565Row_NEON(const uint8* src_y,
418 419 420 421
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          int width) {
422
  asm volatile (
423
    YUV422TORGB_SETUP_REG
424
    ".p2align   2                              \n"
425 426 427 428
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
429
    ARGBTORGB565
430
    MEMACCESS(3)
431 432 433 434 435 436 437
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)     // %4
438 439 440 441
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
442 443 444 445 446
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
// Pack 8 pixels held as d20 = B, d21 = G, d22 = R, d23 = A (one byte per
// channel) into eight 16-bit ARGB1555 values in q0: B >> 3 in bits 0-4,
// G >> 3 in bits 5-9, R >> 3 in bits 10-14 and A >> 7 in bit 15.
// Overwrites q1 and q8-q11 in addition to q0.
#define ARGBTOARGB1555                                                         \
    "vshr.u8    q10, q10, #3                   \n"  /* B and G (d20, d21)   */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
    "vorr       q0, q0, q1                     \n"  /* BGRA                 */

void I422ToARGB1555Row_NEON(const uint8* src_y,
463 464 465 466
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            int width) {
467
  asm volatile (
468
    YUV422TORGB_SETUP_REG
469
    ".p2align   2                              \n"
470 471 472 473 474 475
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
    ARGBTOARGB1555
476
    MEMACCESS(3)
477 478 479 480 481 482 483
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)     // %4
484 485 486 487
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// Pack 8 pixels held as d20 = B, d21 = G, d22 = R, d23 = A (one byte per
// channel) into 16 bytes of ARGB4444 in q0.  For each pixel the first byte is
// (B >> 4) | (G & 0xf0) and the second is (R >> 4) | (A & 0xf0).
// Requires d4 to hold the bit mask that vbic clears from G and A; the caller
// in this file sets every byte of d4 to 0x0f before the loop.
// Overwrites d20-d23 in addition to q0.
#define ARGBTOARGB4444                                                         \
    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
    "vzip.u8    d0, d1                         \n"  /* BGRA                 */

void I422ToARGB4444Row_NEON(const uint8* src_y,
503 504 505 506
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            int width) {
507
  asm volatile (
508
    YUV422TORGB_SETUP_REG
509
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
510
    ".p2align   2                              \n"
511 512 513 514 515 516
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
    ARGBTOARGB4444
517
    MEMACCESS(3)
518 519 520 521 522 523 524
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)     // %4
525 526 527 528
    : [kUVToRB]"r"(&kUVToRB),   // %5
      [kUVToG]"r"(&kUVToG),     // %6
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
529 530 531 532 533
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

534 535 536 537
void YToARGBRow_NEON(const uint8* src_y,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
538
    YUV422TORGB_SETUP_REG
539
    ".p2align   2                              \n"
540 541 542 543 544
  "1:                                          \n"
    READYUV400
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"
545
    MEMACCESS(1)
546 547 548 549 550
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
551 552 553 554
    : [kUVToRB]"r"(&kUVToRB),   // %3
      [kUVToG]"r"(&kUVToG),     // %4
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
555 556 557 558 559 560 561 562 563
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
564
    ".p2align   2                              \n"
565 566
    "vmov.u8    d23, #255                      \n"
  "1:                                          \n"
567
    MEMACCESS(0)
568
    "vld1.8     {d20}, [%0]!                   \n"
569 570 571
    "vmov       d21, d20                       \n"
    "vmov       d22, d20                       \n"
    "subs       %2, %2, #8                     \n"
572
    MEMACCESS(1)
573 574 575 576 577 578 579 580 581 582
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d20", "d21", "d22", "d23"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
583 584 585
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
586 587
                        int width) {
  asm volatile (
588
    YUV422TORGB_SETUP_REG
589
    ".p2align   2                              \n"
590 591 592 593 594
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"
595
    MEMACCESS(2)
596 597
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
598 599 600 601
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
602 603 604 605
    : [kUVToRB]"r"(&kUVToRB),   // %4
      [kUVToG]"r"(&kUVToG),     // %5
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
606 607
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
608 609 610
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
611 612 613
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
614 615
                        int width) {
  asm volatile (
616
    YUV422TORGB_SETUP_REG
617
    ".p2align   2                              \n"
618 619 620 621 622
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"
623
    MEMACCESS(2)
624 625
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
626 627 628 629
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
630 631 632 633
    : [kUVToRB]"r"(&kUVToRB),   // %4
      [kUVToG]"r"(&kUVToG),     // %5
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
634 635
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
636 637 638
  );
}

639
void NV12ToRGB565Row_NEON(const uint8* src_y,
640 641 642
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
643
  asm volatile (
644
    YUV422TORGB_SETUP_REG
645
    ".p2align   2                              \n"
646 647 648 649 650
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    ARGBTORGB565
651
    MEMACCESS(2)
652 653 654 655 656 657
    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)      // %3
658 659 660 661
    : [kUVToRB]"r"(&kUVToRB),   // %4
      [kUVToG]"r"(&kUVToG),     // %5
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
662 663 664 665 666 667
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void NV21ToRGB565Row_NEON(const uint8* src_y,
668 669 670
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
671
  asm volatile (
672
    YUV422TORGB_SETUP_REG
673
    ".p2align   2                              \n"
674 675 676 677 678
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    ARGBTORGB565
679
    MEMACCESS(2)
680 681 682 683 684 685
    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)      // %3
686 687 688 689
    : [kUVToRB]"r"(&kUVToRB),   // %4
      [kUVToG]"r"(&kUVToG),     // %5
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
690 691 692 693 694
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

695 696 697 698
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
699
    YUV422TORGB_SETUP_REG
700
    ".p2align   2                              \n"
701 702 703 704 705
  "1:                                          \n"
    READYUY2
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"
706
    MEMACCESS(1)
707 708 709 710 711
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
712 713 714 715
    : [kUVToRB]"r"(&kUVToRB),   // %3
      [kUVToG]"r"(&kUVToG),     // %4
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
716 717 718 719 720 721 722 723 724
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
725
    YUV422TORGB_SETUP_REG
726
    ".p2align   2                              \n"
727 728 729 730 731
  "1:                                          \n"
    READUYVY
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"
732
    MEMACCESS(1)
733 734 735 736 737
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
738 739 740 741
    : [kUVToRB]"r"(&kUVToRB),   // %3
      [kUVToG]"r"(&kUVToG),     // %4
      [kUVBiasBGR]"r"(&kUVBiasBGR),
      [kYToRgb]"r"(&kYToRgb)
742 743 744 745 746
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

747
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
748 749
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
750
  asm volatile (
751
    ".p2align   2                              \n"
752
  "1:                                          \n"
753
    MEMACCESS(0)
754
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
755
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
756
    MEMACCESS(1)
757
    "vst1.8     {q0}, [%1]!                    \n"  // store U
758
    MEMACCESS(2)
759
    "vst1.8     {q1}, [%2]!                    \n"  // store V
760
    "bgt        1b                             \n"
761 762 763
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
764
      "+r"(width)    // %3  // Output registers
765
    :                       // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
766
    : "cc", "memory", "q0", "q1"  // Clobber List
767 768 769
  );
}

770
// Reads 16 U's and V's and writes out 16 pairs of UV.
771 772
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
773
  asm volatile (
774
    ".p2align   2                              \n"
775
  "1:                                          \n"
776
    MEMACCESS(0)
777
    "vld1.8     {q0}, [%0]!                    \n"  // load U
778
    MEMACCESS(1)
779
    "vld1.8     {q1}, [%1]!                    \n"  // load V
780
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
781
    MEMACCESS(2)
782
    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
783 784 785 786 787 788 789
    "bgt        1b                             \n"
    :
      "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
790
    : "cc", "memory", "q0", "q1"  // Clobber List
791 792
  );
}
793

794
// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
795
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
796
  asm volatile (
797
    ".p2align   2                              \n"
798
  "1:                                          \n"
799
    MEMACCESS(0)
800
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
801
    "subs       %2, %2, #32                    \n"  // 32 processed per loop
802
    MEMACCESS(1)
803
    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
804
    "bgt        1b                             \n"
805 806 807 808
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2  // Output registers
  :                     // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
809
  : "cc", "memory", "q0", "q1"  // Clobber List
810 811 812
  );
}

813
// SetRow writes 'count' bytes using an 8 bit value repeated.
814
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
815
  asm volatile (
816 817
    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
  "1:                                          \n"
818
    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
819
    MEMACCESS(0)
820
    "vst1.8    {q0}, [%0]!                     \n"  // store
821
    "bgt       1b                              \n"
822 823
  : "+r"(dst),   // %0
    "+r"(count)  // %1
824
  : "r"(v8)      // %2
fbarchard@google.com's avatar
fbarchard@google.com committed
825
  : "cc", "memory", "q0"
826
  );
827 828
}

829
// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
830
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
831 832 833 834 835 836 837 838 839 840 841 842
  asm volatile (
    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
  "1:                                          \n"
    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
    MEMACCESS(0)
    "vst1.8    {q0}, [%0]!                     \n"  // store
    "bgt       1b                              \n"
  : "+r"(dst),   // %0
    "+r"(count)  // %1
  : "r"(v32)     // %2
  : "cc", "memory", "q0"
  );
843 844
}

845
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
846
  asm volatile (
847 848 849 850 851
    // Start at end of source row.
    "mov        r3, #-16                       \n"
    "add        %0, %0, %2                     \n"
    "sub        %0, #16                        \n"

852
    ".p2align   2                              \n"
853
  "1:                                          \n"
854
    MEMACCESS(0)
855 856 857
    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
    "subs       %2, #16                        \n"  // 16 pixels per loop.
    "vrev64.8   q0, q0                         \n"
858
    MEMACCESS(1)
859
    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
860
    MEMACCESS(1)
861 862 863 864 865 866
    "vst1.8     {d0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
867
  : "cc", "memory", "r3", "q0"
868 869 870
  );
}

871 872
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
873
  asm volatile (
874
    // Start at end of source row.
875
    "mov        r12, #-16                      \n"
876 877 878
    "add        %0, %0, %3, lsl #1             \n"
    "sub        %0, #16                        \n"

879
    ".p2align   2                              \n"
880
  "1:                                          \n"
881
    MEMACCESS(0)
882
    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
883 884
    "subs       %3, #8                         \n"  // 8 pixels per loop.
    "vrev64.8   q0, q0                         \n"
885
    MEMACCESS(1)
886
    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
887
    MEMACCESS(2)
888 889 890 891 892 893 894
    "vst1.8     {d1}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
895
  : "cc", "memory", "r12", "q0"
896 897
  );
}
898 899 900 901 902 903 904 905

void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov        r3, #-16                       \n"
    "add        %0, %0, %2, lsl #2             \n"
    "sub        %0, #16                        \n"

906
    ".p2align   2                              \n"
907
  "1:                                          \n"
908
    MEMACCESS(0)
909 910 911
    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
    "subs       %2, #4                         \n"  // 4 pixels per loop.
    "vrev64.32  q0, q0                         \n"
912
    MEMACCESS(1)
913
    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
914
    MEMACCESS(1)
915 916 917 918 919 920
    "vst1.8     {d0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
921
  : "cc", "memory", "r3", "q0"
922 923
  );
}
924

925 926
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
927
    "vmov.u8    d4, #255                       \n"  // Alpha
928
    ".p2align   2                              \n"
929
  "1:                                          \n"
930
    MEMACCESS(0)
931
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
932
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
933
    MEMACCESS(1)
934
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
935 936
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
937 938
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
939
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
940
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
941 942 943 944 945
  );
}

void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
946
    "vmov.u8    d4, #255                       \n"  // Alpha
947
    ".p2align   2                              \n"
948
  "1:                                          \n"
949
    MEMACCESS(0)
950
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
951
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
952
    "vswp.u8    d1, d3                         \n"  // swap R, B
953
    MEMACCESS(1)
954
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
955
    "bgt        1b                             \n"
956
  : "+r"(src_raw),   // %0
957 958 959
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
960
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
961 962 963
  );
}

// Asm fragment: unpacks 8 RGB565 pixels held in q0 into 8-bit channels,
// leaving B in d0, G in d1 and R in d2. Each channel's upper bits are
// replicated into its low bits to fill 8 bits. Uses q2 and d6 as scratch;
// d3 is not touched.
#define RGB565TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */

void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
979
    ".p2align   2                              \n"
980
  "1:                                          \n"
981
    MEMACCESS(0)
982
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
983 984
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    RGB565TOARGB
985
    MEMACCESS(1)
986 987 988 989 990 991
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
992
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
993 994 995
  );
}

// Asm fragment: unpacks 8 ARGB1555 pixels held in q0 into 8-bit channels,
// leaving B in d0, G in d1, R in d2 and A in d3. The single alpha bit is
// expanded to 0x00/0xFF via shift + negate. Uses q2 and q3 as scratch.
#define ARGB1555TOARGB                                                         \
    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */

// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
// Asm fragment: unpacks 8 pixels held in q0, leaving B in d0, G in d1 and
// R in d2. Uses q2 and d6 as scratch; d3 is not touched.
#define RGB555TOARGB                                                           \
    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
    "vorr.u8    d1, d4, d6                     \n"  /* G                    */

1023 1024 1025 1026
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
1027
    ".p2align   2                              \n"
1028
  "1:                                          \n"
1029
    MEMACCESS(0)
1030 1031 1032
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB1555TOARGB
1033
    MEMACCESS(1)
1034 1035 1036 1037 1038 1039
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1040
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
  );
}

// Asm fragment: unpacks 8 ARGB4444 pixels held in q0 into 8-bit channels,
// leaving B in d0, G in d1, R in d2 and A in d3. Each 4-bit channel is
// widened by OR-ing the nibble with itself shifted left by 4. Uses q2 as
// scratch and overwrites all of q1.
#define ARGB4444TOARGB                                                         \
    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */

void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
1058
    ".p2align   2                              \n"
1059
  "1:                                          \n"
1060
    MEMACCESS(0)
1061 1062 1063
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB4444TOARGB
1064
    MEMACCESS(1)
1065 1066 1067 1068 1069 1070
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1071
  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1072 1073 1074
  );
}

1075 1076
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
1077
    ".p2align   2                              \n"
1078
  "1:                                          \n"
1079
    MEMACCESS(0)
1080
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
1081
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1082
    MEMACCESS(1)
1083
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
1084 1085 1086 1087 1088
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_rgb24),  // %1
    "+r"(pix)         // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1089
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
1090 1091 1092
  );
}

1093 1094
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
1095
    ".p2align   2                              \n"
1096
  "1:                                          \n"
1097
    MEMACCESS(0)
1098
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
1099
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1100
    "vswp.u8    d1, d3                         \n"  // swap R, B
1101
    MEMACCESS(1)
1102
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
1103 1104 1105 1106 1107
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_raw),   // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1108
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
1109 1110 1111
  );
}

1112 1113
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
1114
    ".p2align   2                              \n"
1115
  "1:                                          \n"
1116
    MEMACCESS(0)
1117
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
1118
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
1119
    MEMACCESS(1)
1120
    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
1121 1122 1123 1124 1125
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1126
  : "cc", "memory", "q0", "q1"  // Clobber List
1127 1128 1129 1130 1131
  );
}

void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
1132
    ".p2align   2                              \n"
1133
  "1:                                          \n"
1134
    MEMACCESS(0)
1135
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
1136
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
1137
    MEMACCESS(1)
1138
    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
1139 1140 1141 1142 1143
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1144
  : "cc", "memory", "q0", "q1"  // Clobber List
1145 1146 1147 1148 1149 1150
  );
}

void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
1151
    ".p2align   2                              \n"
1152
  "1:                                          \n"
1153
    MEMACCESS(0)
1154
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1155
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1156
    MEMACCESS(1)
1157
    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
1158
    MEMACCESS(2)
1159
    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
1160 1161 1162 1163 1164 1165
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1166
  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1167 1168 1169 1170 1171 1172
  );
}

void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
1173
    ".p2align   2                              \n"
1174
  "1:                                          \n"
1175
    MEMACCESS(0)
1176
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1177
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1178
    MEMACCESS(1)
1179
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
1180
    MEMACCESS(2)
1181
    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
1182 1183 1184 1185 1186 1187
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1188
  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1189 1190 1191 1192 1193 1194
  );
}

void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1195
    "add        %1, %0, %1                     \n"  // stride + src_yuy2
1196
    ".p2align   2                              \n"
1197
  "1:                                          \n"
1198
    MEMACCESS(0)
1199
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1200
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1201
    MEMACCESS(1)
1202 1203 1204
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
1205
    MEMACCESS(2)
1206
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
1207
    MEMACCESS(3)
1208
    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
1209
    "bgt        1b                             \n"
1210
  : "+r"(src_yuy2),     // %0
1211
    "+r"(stride_yuy2),  // %1
1212 1213 1214
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(pix)           // %4
1215
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1216
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1217 1218 1219 1220 1221 1222
  );
}

void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1223
    "add        %1, %0, %1                     \n"  // stride + src_uyvy
1224
    ".p2align   2                              \n"
1225
  "1:                                          \n"
1226
    MEMACCESS(0)
1227
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1228
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1229
    MEMACCESS(1)
1230 1231 1232
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
1233
    MEMACCESS(2)
1234
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
1235
    MEMACCESS(3)
1236
    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
1237
    "bgt        1b                             \n"
1238
  : "+r"(src_uyvy),     // %0
1239
    "+r"(stride_uyvy),  // %1
1240 1241 1242
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(pix)           // %4
1243
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1244
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1245 1246
  );
}
1247

1248 1249 1250 1251 1252
// Select G channels from ARGB.  e.g.  GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /*selector*/, int pix) {
  asm volatile (
  "1:                                          \n"
1253
    MEMACCESS(0)
1254 1255
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
1256
    MEMACCESS(1)
1257 1258 1259 1260 1261 1262 1263 1264 1265 1266
    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
1267 1268 1269 1270
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
1271
    MEMACCESS(3)
1272
    "vld1.8     {q2}, [%3]                     \n"  // shuffler
fbarchard@google.com's avatar
fbarchard@google.com committed
1273
  "1:                                          \n"
1274
    MEMACCESS(0)
1275
    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1276 1277 1278
    "subs       %2, %2, #4                     \n"  // 4 processed per loop
    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
1279
    MEMACCESS(1)
1280
    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
fbarchard@google.com's avatar
fbarchard@google.com committed
1281 1282 1283 1284 1285 1286
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1287
  );
1288 1289
}

fbarchard@google.com's avatar
fbarchard@google.com committed
1290 1291 1292 1293 1294
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
1295
    ".p2align   2                              \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1296
  "1:                                          \n"
1297
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1298
    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
1299
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1300
    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
1301
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1302 1303
    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
    "subs       %4, %4, #16                    \n"  // 16 pixels
1304
    MEMACCESS(3)
1305
    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1306
    "bgt        1b                             \n"
1307 1308 1309 1310 1311 1312 1313
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_yuy2),  // %3
    "+r"(width)      // %4
  :
  : "cc", "memory", "d0", "d1", "d2", "d3"
fbarchard@google.com's avatar
fbarchard@google.com committed
1314 1315 1316 1317 1318 1319 1320 1321
  );
}

void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile (
1322
    ".p2align   2                              \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1323
  "1:                                          \n"
1324
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1325
    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
1326
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1327
    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
1328
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1329 1330
    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
    "subs       %4, %4, #16                    \n"  // 16 pixels
1331
    MEMACCESS(3)
1332
    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1333
    "bgt        1b                             \n"
1334 1335 1336 1337 1338 1339 1340
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_uyvy),  // %3
    "+r"(width)      // %4
  :
  : "cc", "memory", "d0", "d1", "d2", "d3"
fbarchard@google.com's avatar
fbarchard@google.com committed
1341 1342 1343
  );
}

1344 1345
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
  asm volatile (
1346
    ".p2align   2                              \n"
1347
  "1:                                          \n"
1348
    MEMACCESS(0)
1349
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1350
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1351
    ARGBTORGB565
1352
    MEMACCESS(1)
1353 1354 1355 1356 1357 1358
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb565),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1359
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1360 1361 1362 1363 1364 1365
  );
}

void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int pix) {
  asm volatile (
1366
    ".p2align   2                              \n"
1367
  "1:                                          \n"
1368
    MEMACCESS(0)
1369
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1370
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1371
    ARGBTOARGB1555
1372
    MEMACCESS(1)
1373 1374 1375 1376 1377 1378
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb1555),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1379
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1380 1381 1382
  );
}

1383 1384
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int pix) {
1385 1386
  asm volatile (
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
1387
    ".p2align   2                              \n"
1388
  "1:                                          \n"
1389
    MEMACCESS(0)
1390
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1391
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1392
    ARGBTOARGB4444
1393
    MEMACCESS(1)
1394 1395
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
1396
  : "+r"(src_argb),      // %0
1397
    "+r"(dst_argb4444),  // %1
1398
    "+r"(pix)            // %2
1399
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1400
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1401 1402
  );
}
1403 1404

void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1405 1406 1407 1408 1409
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
1410
    ".p2align   2                              \n"
1411
  "1:                                          \n"
1412
    MEMACCESS(0)
1413
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1414 1415 1416 1417 1418 1419
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
1420
    MEMACCESS(1)
1421 1422 1423 1424 1425 1426
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1427
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1428 1429 1430
  );
}

1431 1432
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
1433 1434 1435
    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
1436
    ".p2align   2                              \n"
1437
  "1:                                          \n"
1438
    MEMACCESS(0)
1439 1440 1441 1442 1443
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
1444
    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
1445
    MEMACCESS(1)
1446 1447 1448 1449 1450 1451 1452 1453 1454 1455
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

1456
// 8x1 pixels.
1457 1458 1459
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
1460 1461 1462 1463 1464 1465
    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1466
    ".p2align   2                              \n"
1467
  "1:                                          \n"
1468
    MEMACCESS(0)
1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlsl.u8   q2, d1, d25                    \n"  // G
    "vmlsl.u8   q2, d2, d26                    \n"  // R
    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned

    "vmull.u8   q3, d2, d24                    \n"  // R
    "vmlsl.u8   q3, d1, d28                    \n"  // G
    "vmlsl.u8   q3, d0, d27                    \n"  // B
    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned

1481 1482
    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
1483

1484
    MEMACCESS(1)
1485
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1486
    MEMACCESS(2)
1487
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1488 1489 1490 1491 1492 1493
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1494
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
1495 1496
  );
}
1497 1498 1499 1500 1501 1502 1503 1504 1505

// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1506
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1507
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1508
    ".p2align   2                              \n"
1509
  "1:                                          \n"
1510
    MEMACCESS(0)
1511
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1512
    MEMACCESS(0)
1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.

    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.

    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q0, q10                    \n"  // B
    "vmls.s16   q8, q1, q11                    \n"  // G
    "vmls.s16   q8, q2, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned

    "vmul.s16   q9, q2, q10                    \n"  // R
    "vmls.s16   q9, q1, q14                    \n"  // G
    "vmls.s16   q9, q0, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned

    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V

1533
    MEMACCESS(1)
1534
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1535
    MEMACCESS(2)
1536 1537 1538 1539 1540 1541 1542
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1543
  : "cc", "memory", "q0", "q1", "q2", "q3",
1544 1545 1546
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1547

fbarchard@google.com's avatar
fbarchard@google.com committed
1548 1549 1550 1551
// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
1552 1553 1554 1555 1556
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1557
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1558
    ".p2align   2                              \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1559
  "1:                                          \n"
1560
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1561
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1562
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1563 1564 1565 1566
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1567
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1568
    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
1569
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1570 1571 1572 1573
    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
1574

fbarchard@google.com's avatar
fbarchard@google.com committed
1575 1576 1577 1578 1579 1580
    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
    "vpadd.u16  d1, d8, d9                     \n"  // B
    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
    "vpadd.u16  d3, d10, d11                   \n"  // G
    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
    "vpadd.u16  d5, d12, d13                   \n"  // R
1581 1582 1583 1584 1585

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596
    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
    "vmul.s16   q8, q0, q10                    \n"  // B
    "vmls.s16   q8, q1, q11                    \n"  // G
    "vmls.s16   q8, q2, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q2, q10                    \n"  // R
    "vmls.s16   q9, q1, q14                    \n"  // G
    "vmls.s16   q9, q0, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1597
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1598
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1599
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1600 1601 1602 1603 1604 1605 1606
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1607
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
1608 1609 1610 1611
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
1612
// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
// RGBTOUV(QB, QG, QR) expands to the asm text that turns three q registers
// of 16-bit blue, green and red sums into 8 U bytes (d0) and 8 V bytes (d1).
// The caller must have loaded the coefficients first:
//   q10 = UB / VR, q11 = UG, q12 = UR, q13 = VB, q14 = VG, q15 = bias.
// q8 and q9 are used as scratch, and d0/d1 are overwritten with the result.
#define RGBTOUV(QB, QG, QR) \
    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */

1625
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
fbarchard@google.com's avatar
fbarchard@google.com committed
1626 1627
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1628
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1629
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1630 1631 1632 1633 1634
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1635
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1636
    ".p2align   2                              \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1637
  "1:                                          \n"
1638
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1639
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1640
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1641 1642 1643 1644
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1645
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1646
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1647
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1648 1649 1650 1651
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1652 1653 1654 1655 1656

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
1657
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1658
    RGBTOUV(q0, q1, q2)
1659
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1660
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1661
    MEMACCESS(3)
fbarchard@google.com's avatar
fbarchard@google.com committed
1662
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
fbarchard@google.com's avatar
fbarchard@google.com committed
1663 1664
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
fbarchard@google.com's avatar
fbarchard@google.com committed
1665 1666 1667 1668
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
fbarchard@google.com's avatar
fbarchard@google.com committed
1669
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1670
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
1671 1672 1673
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1674

1675 1676 1677 1678 1679
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1680 1681 1682 1683 1684
    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1685
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1686
    ".p2align   2                              \n"
1687
  "1:                                          \n"
1688
    MEMACCESS(0)
1689
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1690
    MEMACCESS(0)
1691 1692 1693 1694
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1695
    MEMACCESS(1)
1696
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1697
    MEMACCESS(1)
1698 1699 1700 1701
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1702 1703 1704 1705 1706

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1707 1708
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1709
    MEMACCESS(2)
1710
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1711
    MEMACCESS(3)
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

1725 1726 1727 1728
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
1729 1730 1731 1732 1733
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1734
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1735
    ".p2align   2                              \n"
1736
  "1:                                          \n"
1737
    MEMACCESS(0)
1738
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
1739
    MEMACCESS(0)
1740 1741 1742 1743
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
1744
    MEMACCESS(1)
1745
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
1746
    MEMACCESS(1)
1747 1748 1749 1750
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
1751 1752 1753 1754 1755

    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
    "vrshr.u16  q2, q2, #1                     \n"
    "vrshr.u16  q3, q3, #1                     \n"

1756 1757
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q3, q2, q1)
1758
    MEMACCESS(2)
1759
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1760
    MEMACCESS(3)
1761 1762 1763 1764 1765 1766 1767 1768
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_stride_bgra),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1769
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1770 1771 1772 1773 1774 1775 1776 1777
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
1778 1779 1780 1781 1782
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1783
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1784
    ".p2align   2                              \n"
1785
  "1:                                          \n"
1786
    MEMACCESS(0)
1787
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
1788
    MEMACCESS(0)
1789 1790 1791 1792
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1793
    MEMACCESS(1)
1794
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
1795
    MEMACCESS(1)
1796 1797 1798 1799
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1800 1801 1802 1803 1804

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1805 1806
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
1807
    MEMACCESS(2)
1808
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1809
    MEMACCESS(3)
1810 1811 1812 1813 1814 1815 1816 1817
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_stride_abgr),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1818
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1819 1820 1821 1822 1823 1824 1825 1826
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
1827 1828 1829 1830 1831
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1832
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1833
    ".p2align   2                              \n"
1834
  "1:                                          \n"
1835
    MEMACCESS(0)
1836
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
1837
    MEMACCESS(0)
1838 1839 1840 1841
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
1842
    MEMACCESS(1)
1843
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
1844
    MEMACCESS(1)
1845 1846 1847 1848
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
1849 1850 1851 1852 1853

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1854 1855
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1856
    MEMACCESS(2)
1857
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1858
    MEMACCESS(3)
1859 1860 1861 1862 1863 1864 1865 1866
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_stride_rgba),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1867
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1868 1869 1870 1871 1872 1873 1874 1875
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
1876 1877 1878 1879 1880
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1881
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1882
    ".p2align   2                              \n"
1883
  "1:                                          \n"
1884
    MEMACCESS(0)
1885
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
1886
    MEMACCESS(0)
1887 1888 1889 1890
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1891
    MEMACCESS(1)
1892
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
1893
    MEMACCESS(1)
1894 1895 1896 1897
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1898 1899 1900 1901 1902

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1903 1904
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1905
    MEMACCESS(2)
1906
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1907
    MEMACCESS(3)
1908 1909 1910 1911 1912 1913 1914 1915
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_stride_rgb24),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1916
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1917 1918 1919 1920 1921 1922 1923 1924
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_raw
1925 1926 1927 1928 1929
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1930
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1931
    ".p2align   2                              \n"
1932
  "1:                                          \n"
1933
    MEMACCESS(0)
1934
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
1935
    MEMACCESS(0)
1936 1937 1938 1939
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1940
    MEMACCESS(1)
1941
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
1942
    MEMACCESS(1)
1943 1944 1945 1946
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1947 1948 1949 1950 1951

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1952 1953
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
1954
    MEMACCESS(2)
1955
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1956
    MEMACCESS(3)
1957 1958 1959 1960 1961 1962 1963 1964
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(src_stride_raw),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1965
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1966 1967 1968 1969
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
1970 1971 1972 1973 1974
// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1975 1976 1977 1978 1979
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1980
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1981
    ".p2align   2                              \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1982
  "1:                                          \n"
1983
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1984 1985 1986 1987 1988
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1989
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1990 1991 1992 1993 1994 1995
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1996
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1997 1998 1999 2000 2001
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2002
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
2003 2004 2005 2006 2007 2008
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

2009 2010 2011 2012
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2024
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
2025
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2026
    MEMACCESS(3)
fbarchard@google.com's avatar
fbarchard@google.com committed
2027 2028 2029 2030 2031 2032 2033 2034
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(src_stride_rgb565),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2035
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
2036 2037 2038
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2039 2040 2041 2042 2043 2044

// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                        uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
2045 2046 2047 2048 2049
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2050
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2051
    ".p2align   2                              \n"
2052
  "1:                                          \n"
2053
    MEMACCESS(0)
2054 2055 2056 2057 2058
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2059
    MEMACCESS(0)
2060 2061 2062 2063 2064 2065
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

2066
    MEMACCESS(1)
2067 2068 2069 2070 2071
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2072
    MEMACCESS(1)
2073 2074 2075 2076 2077 2078
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

2079 2080 2081 2082
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2094
    MEMACCESS(2)
2095
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2096
    MEMACCESS(3)
2097 2098 2099 2100 2101 2102 2103 2104
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(src_stride_argb1555),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2105
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2106 2107 2108 2109 2110 2111 2112 2113 2114
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
2115 2116 2117 2118 2119
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
2120
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
2121
    ".p2align   2                              \n"
2122
  "1:                                          \n"
2123
    MEMACCESS(0)
2124 2125 2126 2127 2128
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2129
    MEMACCESS(0)
2130 2131 2132 2133 2134 2135
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

2136
    MEMACCESS(1)
2137 2138 2139 2140 2141
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
2142
    MEMACCESS(1)
2143 2144 2145 2146 2147 2148
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

2149 2150 2151 2152
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
2164
    MEMACCESS(2)
2165
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
2166
    MEMACCESS(3)
2167 2168 2169 2170 2171 2172 2173 2174
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(src_stride_argb4444),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(pix)        // %4
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2175
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
2176 2177 2178
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
fbarchard@google.com's avatar
fbarchard@google.com committed
2179

2180 2181 2182 2183 2184 2185
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2186
    ".p2align   2                              \n"
2187
  "1:                                          \n"
2188
    MEMACCESS(0)
2189
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
2190 2191 2192 2193 2194 2195 2196
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    RGB565TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
2197
    MEMACCESS(1)
2198 2199 2200 2201 2202 2203
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(pix)          // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2204
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2205 2206 2207
  );
}

2208 2209 2210 2211 2212 2213
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2214
    ".p2align   2                              \n"
2215
  "1:                                          \n"
2216
    MEMACCESS(0)
2217 2218 2219 2220 2221 2222 2223 2224
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
2225
    MEMACCESS(1)
2226 2227 2228 2229 2230 2231
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2232
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2233 2234 2235 2236 2237 2238 2239 2240 2241
  );
}

void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
2242
    ".p2align   2                              \n"
2243
  "1:                                          \n"
2244
    MEMACCESS(0)
2245 2246 2247 2248 2249 2250 2251 2252
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
2253
    MEMACCESS(1)
2254 2255 2256 2257 2258 2259
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2260
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2261 2262
  );
}
2263 2264 2265 2266 2267 2268 2269

void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2270
    ".p2align   2                              \n"
2271
  "1:                                          \n"
2272
    MEMACCESS(0)
2273 2274 2275 2276 2277 2278 2279
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d1, d4                     \n"  // R
    "vmlal.u8   q8, d2, d5                     \n"  // G
    "vmlal.u8   q8, d3, d6                     \n"  // B
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2280
    MEMACCESS(1)
2281 2282 2283 2284 2285 2286
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2287
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2288 2289 2290 2291 2292 2293 2294 2295 2296
  );
}

void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2297
    ".p2align   2                              \n"
2298
  "1:                                          \n"
2299
    MEMACCESS(0)
2300 2301 2302 2303 2304 2305 2306
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // R
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // B
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2307
    MEMACCESS(1)
2308 2309 2310 2311 2312 2313
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2314
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2315 2316 2317 2318
  );
}

void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2319 2320 2321 2322 2323
  asm volatile (
    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2324
    ".p2align   2                              \n"
2325
  "1:                                          \n"
2326
    MEMACCESS(0)
2327 2328 2329 2330 2331 2332 2333
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d1, d4                     \n"  // B
    "vmlal.u8   q8, d2, d5                     \n"  // G
    "vmlal.u8   q8, d3, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2334
    MEMACCESS(1)
2335 2336 2337 2338 2339 2340
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2341
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2342 2343 2344 2345 2346 2347 2348 2349 2350
  );
}

void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2351
    ".p2align   2                              \n"
2352
  "1:                                          \n"
2353
    MEMACCESS(0)
2354
    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
2355 2356 2357 2358 2359 2360
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // B
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2361
    MEMACCESS(1)
2362 2363
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
2364
  : "+r"(src_rgb24),  // %0
2365 2366 2367
    "+r"(dst_y),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2368
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2369 2370
  );
}
2371 2372 2373 2374 2375 2376 2377

void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
2378
    ".p2align   2                              \n"
2379
  "1:                                          \n"
2380
    MEMACCESS(0)
2381 2382 2383 2384 2385 2386 2387
    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // B
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2388
    MEMACCESS(1)
2389 2390 2391 2392 2393 2394
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)        // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2395
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2396 2397
  );
}
2398

2399 2400 2401 2402
// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes per loop.
//   dst_ptr           - destination row.
//   src_ptr           - first source row.
//   src_stride        - byte offset from the first row to the second row.
//   dst_width         - bytes to produce; 16 are written per loop iteration.
//   source_y_fraction - weight of the second row out of 256.
// Fractions 0, 64, 128 and 192 take dedicated paths (copy, or one/two
// rounding halving adds); every other value uses the general multiply path:
//   dst = (row0 * (256 - fraction) + row1 * fraction + 128) >> 8.
// When the fraction is 0 the second row is never read.
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    // %2 is reused from here on as the pointer to the second row.
    "add        %2, %1                         \n"
    "cmp        %4, #64                        \n"
    "beq        75f                            \n"
    "cmp        %4, #128                       \n"
    "beq        50f                            \n"
    "cmp        %4, #192                       \n"
    "beq        25f                            \n"

    "vdup.8     d5, %4                         \n"  // weight of second row
    "rsb        %4, #256                       \n"
    "vdup.8     d4, %4                         \n"  // weight of first row
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vmull.u8   q13, d0, d4                    \n"
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
    "vrshrn.u16 d0, q13, #8                    \n"
    "vrshrn.u16 d1, q14, #8                    \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        1b                             \n"
    "b          99f                            \n"

    // Blend 25 / 75.
    // Two rounding halving adds against the second row (q1) give
    // 25% first row, 75% second row.
  "25:                                         \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        25b                            \n"
    "b          99f                            \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        50b                            \n"
    "b          99f                            \n"

    // Blend 75 / 25.
    // Same as the 25 / 75 path with the rows loaded into swapped registers,
    // so the first row (q1) is the one averaged in twice.
  "75:                                         \n"
    MEMACCESS(1)
    "vld1.8     {q1}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        75b                            \n"
    "b          99f                            \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    "subs       %3, %3, #16                    \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        100b                           \n"

  "99:                                         \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
  );
}
fbarchard@google.com's avatar
fbarchard@google.com committed
2495 2496 2497 2498 2499 2500 2501 2502 2503

// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "subs       %3, #8                         \n"
    "blt        89f                            \n"
    // Blend 8 pixels.
  "8:                                          \n"
2504
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2505
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
2506
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
2507 2508
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2509 2510 2511 2512 2513 2514 2515 2516 2517 2518
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
fbarchard@google.com's avatar
fbarchard@google.com committed
2519
    "vmov.u8    d3, #255                       \n"  // a = 255
2520
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
2521 2522 2523 2524 2525 2526 2527 2528 2529
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
    "bge        8b                             \n"

  "89:                                         \n"
    "adds       %3, #8-1                       \n"
    "blt        99f                            \n"

    // Blend 1 pixels.
  "1:                                          \n"
2530
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2531
    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
2532
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
2533 2534
    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
2535 2536 2537 2538 2539 2540 2541 2542 2543 2544
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
fbarchard@google.com's avatar
fbarchard@google.com committed
2545
    "vmov.u8    d3, #255                       \n"  // a = 255
2546
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
2547 2548 2549 2550 2551 2552 2553 2554 2555 2556
    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
    "bge        1b                             \n"

  "99:                                         \n"

  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
2557
  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
fbarchard@google.com's avatar
fbarchard@google.com committed
2558 2559 2560
  );
}

2561 2562 2563 2564 2565
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
2566
    MEMACCESS(0)
2567 2568 2569 2570 2571 2572 2573 2574
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q10, d0, d3                    \n"  // b * a
    "vmull.u8   q11, d1, d3                    \n"  // g * a
    "vmull.u8   q12, d2, d3                    \n"  // r * a
    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
2575
    MEMACCESS(1)
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598
// Quantize 8 ARGB pixels (32 bytes) at a time, in place.
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Only B, G and R are quantized; the alpha bytes are loaded and stored back
// unmodified. The row is read from and written to dst_argb (the load does not
// advance the pointer, the store does).
// scale is halved up front because vqdmulh doubles the product before taking
// the high 16 bits.
// The loop has no tail handling and always runs at least once, so width is
// presumably a positive multiple of 8 - TODO confirm against callers.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "vdup.u16   q8, %2                         \n"
    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
    "vdup.u16   q9, %3                         \n"  // interval multiply.
    "vdup.u16   q10, %4                        \n"  // interval add

    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q2, d4                         \n"
    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
    "vqdmulh.s16 q1, q1, q8                    \n"  // g
    "vqdmulh.s16 q2, q2, q8                    \n"  // r
    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
    "vmul.u16   q1, q1, q9                     \n"  // g
    "vmul.u16   q2, q2, q9                     \n"  // r
    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
    "vadd.u16   q1, q1, q10                    \n"  // g
    "vadd.u16   q2, q2, q10                    \n"  // r
    "vqmovn.u16 d0, q0                         \n"
    "vqmovn.u16 d2, q1                         \n"
    "vqmovn.u16 d4, q2                         \n"
    MEMACCESS(0)
    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  );
}

// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2631
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2632 2633 2634 2635
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
2636 2637
    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
2638 2639 2640 2641

    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
2642
    MEMACCESS(0)
2643 2644 2645 2646 2647 2648
    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
    "vmovl.u8   q11, d22                       \n"
    "vmovl.u8   q12, d24                       \n"
    "vmovl.u8   q13, d26                       \n"
2649
    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
2650 2651 2652
    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
2653 2654 2655 2656
    "vqmovn.u16 d20, q10                       \n"
    "vqmovn.u16 d22, q11                       \n"
    "vqmovn.u16 d24, q12                       \n"
    "vqmovn.u16 d26, q13                       \n"
2657
    MEMACCESS(1)
2658 2659 2660 2661 2662 2663 2664
    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
fbarchard@google.com's avatar
fbarchard@google.com committed
2665 2666 2667
  );
}

2668
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2669 2670
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2671 2672
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
2673 2674 2675
    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
2676
    ".p2align   2                              \n"
2677
  "1:                                          \n"
2678
    MEMACCESS(0)
2679 2680 2681 2682 2683
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
2684
    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
2685 2686
    "vmov       d1, d0                         \n"  // G
    "vmov       d2, d0                         \n"  // R
2687
    MEMACCESS(1)
2688 2689 2690 2691 2692 2693 2694 2695 2696 2697
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Results saturate to 255; the alpha channel is stored back unchanged.
// The row is read from and written to dst_argb (the load does not advance the
// pointer, the store does).
// The loop has no tail handling and always runs at least once, so width is
// presumably a positive multiple of 8 - TODO confirm against callers.
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8    d20, #17                       \n"  // BB coefficient
    "vmov.u8    d21, #68                       \n"  // BG coefficient
    "vmov.u8    d22, #35                       \n"  // BR coefficient
    "vmov.u8    d24, #22                       \n"  // GB coefficient
    "vmov.u8    d25, #88                       \n"  // GG coefficient
    "vmov.u8    d26, #45                       \n"  // GR coefficient
    "vmov.u8    d28, #24                       \n"  // RB coefficient
    "vmov.u8    d29, #98                       \n"  // RG coefficient
    "vmov.u8    d30, #50                       \n"  // RR coefficient
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
    "vmlal.u8   q2, d1, d21                    \n"  // G
    "vmlal.u8   q2, d2, d22                    \n"  // R
    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
    "vmlal.u8   q3, d1, d25                    \n"  // G
    "vmlal.u8   q3, d2, d26                    \n"  // R
    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
    "vmlal.u8   q8, d1, d29                    \n"  // G
    "vmlal.u8   q8, d2, d30                    \n"  // R
    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  // q8 holds the Sepia R accumulator, so it must be listed as clobbered.
  : "cc", "memory", "q0", "q1", "q2", "q3", "q8",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

2741
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
2742 2743 2744 2745
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
2746
  asm volatile (
2747
    MEMACCESS(3)
2748
    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
2749
    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
2750
    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
2751

2752
    ".p2align   2                              \n"
2753
  "1:                                          \n"
2754
    MEMACCESS(0)
2755 2756
    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2757 2758 2759
    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
    "vmovl.u8   q9, d18                        \n"  // g
    "vmovl.u8   q10, d20                       \n"  // r
2760
    "vmovl.u8   q11, d22                       \n"  // a
2761 2762 2763
    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
2764
    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
2765 2766 2767
    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
2768
    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
2769 2770 2771
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2772
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2773 2774 2775
    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
2776
    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
2777 2778 2779
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2780
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2781 2782 2783 2784
    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
2785 2786 2787
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2788 2789 2790 2791 2792
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
2793
    MEMACCESS(1)
2794 2795 2796 2797 2798 2799
    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
2800
  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
2801
    "q10", "q11", "q12", "q13", "q14", "q15"
2802 2803 2804
  );
}

2805 2806
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
#ifdef HAS_ARGBMULTIPLYROW_NEON
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each channel (including alpha) becomes (a * b) >> 8 with rounding.
// The loop has no tail handling and always runs at least once, so width is
// presumably a positive multiple of 8 - TODO confirm against callers.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    ".p2align   2                              \n"
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q0, d0, d1                     \n"  // multiply B
    "vmull.u8   q1, d2, d3                     \n"  // multiply G
    "vmull.u8   q2, d4, d5                     \n"  // multiply R
    "vmull.u8   q3, d6, d7                     \n"  // multiply A
    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_NEON
2840 2841 2842 2843 2844 2845

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
2846
    ".p2align   2                              \n"
2847
  "1:                                          \n"
2848
    MEMACCESS(0)
2849
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2850
    MEMACCESS(1)
2851 2852 2853 2854
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
2855
    MEMACCESS(2)
2856 2857 2858 2859 2860 2861 2862 2863
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
2864 2865 2866 2867 2868 2869 2870 2871 2872
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
2873
    ".p2align   2                              \n"
2874
  "1:                                          \n"
2875
    MEMACCESS(0)
2876
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2877
    MEMACCESS(1)
2878 2879 2880 2881
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
2882
    MEMACCESS(2)
2883 2884 2885 2886 2887 2888 2889 2890 2891
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
2892 2893 2894
  );
}

2895 2896 2897 2898 2899 2900 2901 2902 2903 2904
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // alpha
    // 8 pixel loop.
2905
    ".p2align   2                              \n"
2906
  "1:                                          \n"
2907
    MEMACCESS(0)
2908
    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
2909
    MEMACCESS(1)
2910 2911 2912 2913 2914
    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   d0, d0, d1                     \n"  // add
    "vmov.u8    d1, d0                         \n"
    "vmov.u8    d2, d0                         \n"
2915
    MEMACCESS(2)
2916 2917 2918 2919
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "q0", "q1"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
2932
    ".p2align   2                              \n"
2933
  "1:                                          \n"
2934
    MEMACCESS(0)
2935
    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
2936
    MEMACCESS(1)
2937 2938 2939
    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
    "vqadd.u8   q0, q0, q1                     \n"  // add
2940
    MEMACCESS(2)
2941 2942 2943 2944 2945 2946
    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961
  :
  : "cc", "memory", "q0", "q1"
  );
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // alpha
    // 8 pixel loop.
2962
    ".p2align   2                              \n"
2963
  "1:                                          \n"
2964
    MEMACCESS(0)
2965
    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
2966
    MEMACCESS(1)
2967 2968 2969
    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   d1, d0, d2                     \n"  // add
2970
    MEMACCESS(2)
2971 2972 2973 2974
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
fbarchard@google.com's avatar
fbarchard@google.com committed
2975 2976
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988
  :
  : "cc", "memory", "q0", "q1"
  );
}

// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
2989
    ".p2align   2                              \n"
2990
  "1:                                          \n"
2991
    MEMACCESS(0)
2992
    "vld1.8     {d0}, [%0],%5                  \n"  // top
2993
    MEMACCESS(0)
2994
    "vld1.8     {d1}, [%0],%6                  \n"
2995
    "vsubl.u8   q0, d0, d1                     \n"
2996
    MEMACCESS(1)
2997
    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
2998
    MEMACCESS(1)
2999
    "vld1.8     {d3}, [%1],%6                  \n"
3000 3001 3002
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vadd.s16   q0, q0, q1                     \n"
3003
    MEMACCESS(2)
3004
    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
3005
    MEMACCESS(2)
3006
    "vld1.8     {d3}, [%2],%6                  \n"
3007 3008 3009 3010 3011
    "subs       %4, %4, #8                     \n"  // 8 pixels
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vabs.s16   q0, q0                         \n"
    "vqmovn.u16 d0, q0                         \n"
3012
    MEMACCESS(3)
3013
    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032
    "bgt        1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2),            // %5
    "r"(6)             // %6
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
3033
    ".p2align   2                              \n"
3034
  "1:                                          \n"
3035
    MEMACCESS(0)
3036
    "vld1.8     {d0}, [%0],%4                  \n"  // left
3037
    MEMACCESS(1)
3038
    "vld1.8     {d1}, [%1],%4                  \n"
3039
    "vsubl.u8   q0, d0, d1                     \n"
3040
    MEMACCESS(0)
3041
    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
3042
    MEMACCESS(1)
3043
    "vld1.8     {d3}, [%1],%4                  \n"
3044 3045 3046
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vadd.s16   q0, q0, q1                     \n"
3047
    MEMACCESS(0)
3048
    "vld1.8     {d2}, [%0],%5                  \n"  // right
3049
    MEMACCESS(1)
3050
    "vld1.8     {d3}, [%1],%5                  \n"
3051 3052 3053 3054 3055
    "subs       %3, %3, #8                     \n"  // 8 pixels
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vabs.s16   q0, q0                         \n"
    "vqmovn.u16 d0, q0                         \n"
3056
    MEMACCESS(2)
3057
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
3058 3059 3060 3061 3062 3063 3064 3065 3066 3067
    "bgt        1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  : "r"(1),            // %4
    "r"(6)             // %5
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
3068
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
3069

3070 3071 3072 3073
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif