row_neon.cc 122 KB
Newer Older
1
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include "libyuv/row.h"
12

13 14
#include <stdio.h>

15 16 17 18 19
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

20
// This module is for GCC Neon
21 22
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
23

24
// Read 8 Y, 4 U and 4 V from 422.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each is
// post-incremented by the number of bytes read.
// Result: d0 = 8 Y bytes, d2 = 4 U bytes (lane 0) followed by 4 V bytes
// (lane 1).
#define READYUV422                               \
  MEMACCESS(0)                                   \
  "vld1.8     {d0}, [%0]!                    \n" \
  MEMACCESS(1)                                   \
  "vld1.32    {d2[0]}, [%1]!                 \n" \
  MEMACCESS(2)                                   \
  "vld1.32    {d2[1]}, [%2]!                 \n"
32

33
// Read 8 Y, 8 U and 8 V from 444.
// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer; each is
// post-incremented by 8.
// Horizontally adjacent U (and V) samples are summed pairwise and halved with
// rounding, so the result has the same layout as READYUV422:
// d0 = 8 Y bytes, d2 = 4 averaged U bytes followed by 4 averaged V bytes.
#define READYUV444                               \
  MEMACCESS(0)                                   \
  "vld1.8     {d0}, [%0]!                    \n" \
  MEMACCESS(1)                                   \
  "vld1.8     {d2}, [%1]!                    \n" \
  MEMACCESS(2)                                   \
  "vld1.8     {d3}, [%2]!                    \n" \
  "vpaddl.u8  q1, q1                         \n" \
  "vrshrn.u16 d2, q1, #1                     \n"

44
// Read 8 Y, and set 4 U and 4 V to 128.
// Operand: %0 = Y pointer, post-incremented by 8.
// Result: d0 = 8 Y bytes, d2 = 128 in every byte.
#define READYUV400                               \
  MEMACCESS(0)                                   \
  "vld1.8     {d0}, [%0]!                    \n" \
  "vmov.u8    d2, #128                       \n"
49

50
// Read 8 Y and 4 UV from NV12.
// Operands: %0 = Y pointer, %1 = interleaved UV pointer; each is
// post-incremented by 8.
// The 8 interleaved chroma bytes are de-interleaved so that
// d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes.  d3 is used as scratch.
#define READNV12                                                               \
  MEMACCESS(0)                                                                 \
  "vld1.8     {d0}, [%0]!                    \n"                               \
  MEMACCESS(1)                                                                 \
  "vld1.8     {d2}, [%1]!                    \n"                               \
  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
  "vuzp.u8    d2, d3                         \n"                               \
  "vtrn.u32   d2, d3                         \n"
59 60

// Read 8 Y and 4 VU from NV21.
// Operands: %0 = Y pointer, %1 = interleaved VU pointer; each is
// post-incremented by 8.
// Same as READNV12 except that the vuzp operands are swapped, so the bytes at
// odd offsets (U) land first: d0 = 8 Y bytes, d2 = 4 U bytes followed by
// 4 V bytes.  d3 is used as scratch.
#define READNV21                                                               \
  MEMACCESS(0)                                                                 \
  "vld1.8     {d0}, [%0]!                    \n"                               \
  MEMACCESS(1)                                                                 \
  "vld1.8     {d2}, [%1]!                    \n"                               \
  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
  "vuzp.u8    d3, d2                         \n"                               \
  "vtrn.u32   d2, d3                         \n"

// Read 8 YUY2 pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// vld2 de-interleaves even bytes (Y) into d0 and odd bytes (U,V,U,V,...) into
// d2; the chroma is then split so that d2 = 4 U bytes followed by 4 V bytes.
// d3 is used as scratch.
#define READYUY2                                 \
  MEMACCESS(0)                                   \
  "vld2.8     {d0, d2}, [%0]!                \n" \
  "vmov.u8    d3, d2                         \n" \
  "vuzp.u8    d2, d3                         \n" \
  "vtrn.u32   d2, d3                         \n"
77 78

// Read 8 UYVY pixels (16 bytes).
// Operand: %0 = source pointer, post-incremented by 16.
// vld2 de-interleaves even bytes (U,V,U,V,...) into d2 and odd bytes (Y) into
// d3.  Y is moved to d0 and the chroma is split so that d2 = 4 U bytes
// followed by 4 V bytes.  d3 is used as scratch.
#define READUYVY                                 \
  MEMACCESS(0)                                   \
  "vld2.8     {d2, d3}, [%0]!                \n" \
  "vmov.u8    d0, d3                         \n" \
  "vmov.u8    d3, d2                         \n" \
  "vuzp.u8    d2, d3                         \n" \
  "vtrn.u32   d2, d3                         \n"

// Load the conversion constants consumed by YUVTORGB.  The enclosing asm
// statement must provide the named operands kUVToRB, kUVToG, kUVBiasBGR and
// kYToRgb, each an address held in a register.
//   d24 = 8 bytes read from kUVToRB
//   d25 = 8 bytes read from kUVToG
//   q13 = 1st 16-bit value at kUVBiasBGR in every lane (YUVTORGB adds it to B)
//   q4  = 2nd 16-bit value at kUVBiasBGR in every lane (YUVTORGB adds it to G)
//   q14 = 3rd 16-bit value at kUVBiasBGR in every lane (YUVTORGB adds it to R)
//   q15 = 32-bit value at kYToRgb in every lane
// kUVBiasBGR is passed as an input-only operand, and the compiler assumes such
// operands are unchanged when the asm statement finishes.  The two 2-byte
// post-increments are therefore undone before leaving the macro.
#define YUVTORGB_SETUP                                                         \
  MEMACCESS([kUVToRB])                                                         \
  "vld1.8     {d24}, [%[kUVToRB]]            \n"                               \
  MEMACCESS([kUVToG])                                                          \
  "vld1.8     {d25}, [%[kUVToG]]             \n"                               \
  MEMACCESS([kUVBiasBGR])                                                      \
  "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                             \
  MEMACCESS([kUVBiasBGR])                                                      \
  "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                             \
  MEMACCESS([kUVBiasBGR])                                                      \
  "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                             \
  "sub        %[kUVBiasBGR], %[kUVBiasBGR], #4 \n" /* restore input operand */ \
  MEMACCESS([kYToRgb])                                                         \
  "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"

Frank Barchard's avatar
Frank Barchard committed
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
// Convert 8 pixels of YUV to 8-bit B, G and R planes.
// Inputs:  d0 = 8 Y bytes, d2 = 4 U bytes followed by 4 V bytes (as produced
//          by the READ* macros), plus the constants loaded by YUVTORGB_SETUP
//          (d24, d25, q4, q13, q14, q15).
// Outputs: d20 = B, d21 = G, d22 = R (8 bytes each).
// Scratch: q0, q1, q3, q8, q9 and q10 are overwritten; d23 is not touched.
// Each chroma product is copied into both 16-bit halves of a 32-bit lane
// (vshll #16 followed by vaddw), so one U/V sample is applied to two
// horizontally adjacent pixels.
#define YUVTORGB                                                              \
  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
  "vmovl.s16  q10, d1                        \n"                              \
  "vmovl.s16  q0, d0                         \n"                              \
  "vmul.s32   q10, q10, q15                  \n"                              \
  "vmul.s32   q0, q0, q15                    \n"                              \
  "vqshrun.s32 d0, q0, #16                   \n"                              \
  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
  "vadd.s16   d18, d19                       \n"                              \
  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
  "vaddw.u16  q1, q1, d16                    \n"                              \
  "vaddw.u16  q10, q10, d17                  \n"                              \
  "vaddw.u16  q3, q3, d18                    \n"                              \
  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
  "vqshrun.s16 d21, q0, #6                   \n" /* G */
127

128 129 130 131
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
132
                        const struct YuvConstants* yuvconstants,
133 134
                        int width) {
  asm volatile (
135
    YUVTORGB_SETUP
136
    "vmov.u8    d23, #255                      \n"
137 138
  "1:                                          \n"
    READYUV444
139
    YUVTORGB
140
    "subs       %4, %4, #8                     \n"
141
    MEMACCESS(3)
142 143 144 145 146 147 148
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
149 150 151 152
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
153
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
154 155 156 157
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

158 159 160 161
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
162
                        const struct YuvConstants* yuvconstants,
163
                        int width) {
164
  asm volatile (
165
    YUVTORGB_SETUP
166
    "vmov.u8    d23, #255                      \n"
167
  "1:                                          \n"
168
    READYUV422
169
    YUVTORGB
170
    "subs       %4, %4, #8                     \n"
171
    MEMACCESS(3)
172
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
173
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
174 175 176 177 178
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
179 180 181 182
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
183
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
184
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
185 186 187
  );
}

188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
// Convert a row of I422 plus a separate alpha plane to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y, 4 U, 4 V and 8 alpha bytes and stores 8 pixels
// (32 bytes) in B, G, R, A byte order, with A copied from src_a.
// The loop has no remainder handling and always runs at least once.
// NOTE(review): callers presumably pass width as a positive multiple of 8 -
// confirm.
void I422AlphaToARGBRow_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             const uint8* src_a,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "subs       %5, %5, #8                     \n"  // 8 pixels per loop
    MEMACCESS(3)
    "vld1.8     {d23}, [%3]!                   \n"  // load 8 alpha bytes
    MEMACCESS(4)
    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"  // interleave B, G, R, A
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
221 222 223 224
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
225
                        const struct YuvConstants* yuvconstants,
226 227
                        int width) {
  asm volatile (
228
    YUVTORGB_SETUP
229
  "1:                                          \n"
230
    READYUV422
231
    YUVTORGB
232
    "subs       %4, %4, #8                     \n"
233
    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
234
    MEMACCESS(3)
235
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
236
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
237 238 239 240 241
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
242 243 244 245
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
246
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
247
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
248 249 250
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
251
void I422ToRGB24Row_NEON(const uint8* src_y,
252 253 254
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
255
                         const struct YuvConstants* yuvconstants,
256
                         int width) {
257
  asm volatile (
258
    YUVTORGB_SETUP
259 260
  "1:                                          \n"
    READYUV422
261
    YUVTORGB
262
    "subs       %4, %4, #8                     \n"
263
    MEMACCESS(3)
264 265
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
266 267 268 269 270
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
271 272 273 274
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
275
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
276 277 278 279
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
280 281 282 283 284 285
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R into 8 16-bit
// RGB565 values in q0.
// Each channel is first shifted into the top byte of a 16-bit lane; vsri then
// inserts G below the top 5 bits (R) and B below the top 11 bits (R + G),
// giving R in bits 15..11, G in bits 10..5 and B in bits 4..0.
// Scratch: q8 and q9 are overwritten.
#define ARGBTORGB565                                                        \
  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
286

287
void I422ToRGB565Row_NEON(const uint8* src_y,
288 289 290
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
291
                          const struct YuvConstants* yuvconstants,
292
                          int width) {
293
  asm volatile (
294
    YUVTORGB_SETUP
295 296
  "1:                                          \n"
    READYUV422
297
    YUVTORGB
298
    "subs       %4, %4, #8                     \n"
299
    ARGBTORGB565
300
    MEMACCESS(3)
301 302 303 304 305 306 307
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)     // %4
308 309 310 311
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
312
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
313 314 315 316
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
317 318 319 320 321 322 323 324
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R, d23 = A into
// 8 16-bit ARGB1555 values in q0.
// Each channel is first shifted into the top byte of a 16-bit lane; the vsri
// steps then keep 1 bit of A (bit 15), and insert R at bits 14..10, G at
// bits 9..5 and B at bits 4..0.
// Scratch: q8, q9 and q10 are overwritten (so d20 and d21 are destroyed).
#define ARGBTOARGB1555                                                      \
  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
325 326

void I422ToARGB1555Row_NEON(const uint8* src_y,
327 328 329
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
330
                            const struct YuvConstants* yuvconstants,
331
                            int width) {
332
  asm volatile (
333
    YUVTORGB_SETUP
334 335
  "1:                                          \n"
    READYUV422
336
    YUVTORGB
337 338 339
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
    ARGBTOARGB1555
340
    MEMACCESS(3)
341 342 343 344 345 346 347
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)     // %4
348 349 350 351
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
352
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
353 354 355 356
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
357 358 359 360 361 362 363 364
// Pack 8 pixels held as byte planes d20 = B, d21 = G, d22 = R, d23 = A into
// 8 16-bit ARGB4444 values in q0.
// d4 supplies the bits that vbic clears from G and A; I422ToARGB4444Row_NEON
// loads it with 0x0f in every byte so only the high nibble survives.
// B and R are shifted down to the low nibble, OR-ed with G and A, and the two
// resulting byte planes are interleaved with vzip.
// d20..d23 are modified in place.
#define ARGBTOARGB4444                                                      \
  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
  "vorr       d0, d20, d21                   \n" /* BG                   */ \
  "vorr       d1, d22, d23                   \n" /* RA                   */ \
  "vzip.u8    d0, d1                         \n" /* BGRA                 */
365 366

void I422ToARGB4444Row_NEON(const uint8* src_y,
367 368 369
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
370
                            const struct YuvConstants* yuvconstants,
371
                            int width) {
372
  asm volatile (
373
    YUVTORGB_SETUP
374 375 376
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
  "1:                                          \n"
    READYUV422
377
    YUVTORGB
378 379 380
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"
    ARGBTOARGB4444
381
    MEMACCESS(3)
382 383 384 385 386 387 388
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)     // %4
389 390 391 392
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
393
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
394 395 396 397
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
398
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
399
  asm volatile (
400
    YUVTORGB_SETUP
401
    "vmov.u8    d23, #255                      \n"
402 403
  "1:                                          \n"
    READYUV400
404
    YUVTORGB
405
    "subs       %2, %2, #8                     \n"
406
    MEMACCESS(1)
407 408 409 410 411
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
412 413 414 415
    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
416
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
417 418 419 420
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
421
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
422 423 424
  asm volatile (
    "vmov.u8    d23, #255                      \n"
  "1:                                          \n"
425
    MEMACCESS(0)
426
    "vld1.8     {d20}, [%0]!                   \n"
427 428 429
    "vmov       d21, d20                       \n"
    "vmov       d22, d20                       \n"
    "subs       %2, %2, #8                     \n"
430
    MEMACCESS(1)
431 432 433 434 435 436 437 438 439 440
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d20", "d21", "d22", "d23"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
441 442 443
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
444
                        const struct YuvConstants* yuvconstants,
445 446
                        int width) {
  asm volatile (
447
    YUVTORGB_SETUP
448
    "vmov.u8    d23, #255                      \n"
449 450
  "1:                                          \n"
    READNV12
451
    YUVTORGB
452
    "subs       %3, %3, #8                     \n"
453
    MEMACCESS(2)
454 455
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
456 457 458 459
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
460 461 462 463
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
464
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
465
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
466 467 468
  );
}

469 470 471
// Convert a row of NV21 (Y plane plus interleaved VU plane) to ARGB using the
// coefficients in yuvconstants.
// Each iteration reads 8 Y bytes and 8 VU bytes and stores 8 pixels
// (32 bytes) in B, G, R, A byte order with A = 255.
// The loop has no remainder handling and always runs at least once.
// NOTE(review): callers presumably pass width as a positive multiple of 8 -
// confirm.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8    d23, #255                      \n"  // alpha; YUVTORGB leaves d23 alone
  "1:                                          \n"
    READNV21
    YUVTORGB
    "subs       %3, %3, #8                     \n"  // 8 pixels per loop
    MEMACCESS(2)
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // interleave B, G, R, A
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

497
void NV12ToRGB565Row_NEON(const uint8* src_y,
498 499
                          const uint8* src_uv,
                          uint8* dst_rgb565,
500
                          const struct YuvConstants* yuvconstants,
501
                          int width) {
502
  asm volatile (
503
    YUVTORGB_SETUP
504 505
  "1:                                          \n"
    READNV12
506
    YUVTORGB
507 508
    "subs       %3, %3, #8                     \n"
    ARGBTORGB565
509
    MEMACCESS(2)
510 511 512 513 514 515
    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)      // %3
516 517 518 519
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
520
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
521 522 523 524
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

525 526
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
527
                        const struct YuvConstants* yuvconstants,
528 529
                        int width) {
  asm volatile (
530
    YUVTORGB_SETUP
531
    "vmov.u8    d23, #255                      \n"
532 533
  "1:                                          \n"
    READYUY2
534
    YUVTORGB
535
    "subs       %2, %2, #8                     \n"
536
    MEMACCESS(1)
537 538 539 540 541
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
542 543 544 545
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
546
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
547 548 549 550 551 552
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
553
                        const struct YuvConstants* yuvconstants,
554 555
                        int width) {
  asm volatile (
556
    YUVTORGB_SETUP
557
    "vmov.u8    d23, #255                      \n"
558 559
  "1:                                          \n"
    READUYVY
560
    YUVTORGB
561
    "subs       %2, %2, #8                     \n"
562
    MEMACCESS(1)
563 564 565 566 567
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
    "bgt        1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
568 569 570 571
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
572
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
573 574 575 576
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

577
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
Frank Barchard's avatar
Frank Barchard committed
578 579 580
void SplitUVRow_NEON(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
581
                     int width) {
582
  asm volatile (
583
  "1:                                          \n"
584
    MEMACCESS(0)
585
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
586
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
587
    MEMACCESS(1)
588
    "vst1.8     {q0}, [%1]!                    \n"  // store U
589
    MEMACCESS(2)
590
    "vst1.8     {q1}, [%2]!                    \n"  // store V
591
    "bgt        1b                             \n"
592 593 594
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
595
      "+r"(width)    // %3  // Output registers
596
    :                       // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
597
    : "cc", "memory", "q0", "q1"  // Clobber List
598 599 600
  );
}

601
// Reads 16 U's and V's and writes out 16 pairs of UV.
Frank Barchard's avatar
Frank Barchard committed
602 603 604
void MergeUVRow_NEON(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
605
                     int width) {
606 607
  asm volatile (
  "1:                                          \n"
608
    MEMACCESS(0)
609
    "vld1.8     {q0}, [%0]!                    \n"  // load U
610
    MEMACCESS(1)
611
    "vld1.8     {q1}, [%1]!                    \n"  // load V
612
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
613
    MEMACCESS(2)
614
    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
615 616 617 618 619 620 621
    "bgt        1b                             \n"
    :
      "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
622
    : "cc", "memory", "q0", "q1"  // Clobber List
623 624
  );
}
625

626
// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
627
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
628
  asm volatile (
629
  "1:                                          \n"
630
    MEMACCESS(0)
631
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
632
    "subs       %2, %2, #32                    \n"  // 32 processed per loop
633
    MEMACCESS(1)
634
    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
635
    "bgt        1b                             \n"
636 637 638 639
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2  // Output registers
  :                     // Input registers
fbarchard@google.com's avatar
fbarchard@google.com committed
640
  : "cc", "memory", "q0", "q1"  // Clobber List
641 642 643
  );
}

644
// SetRow writes 'count' bytes using an 8 bit value repeated.
645
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
646
  asm volatile (
647 648
    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
  "1:                                          \n"
649
    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
650
    MEMACCESS(0)
651
    "vst1.8    {q0}, [%0]!                     \n"  // store
652
    "bgt       1b                              \n"
653 654
  : "+r"(dst),   // %0
    "+r"(count)  // %1
655
  : "r"(v8)      // %2
fbarchard@google.com's avatar
fbarchard@google.com committed
656
  : "cc", "memory", "q0"
657
  );
658 659
}

660
// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
661
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
662 663 664 665 666 667 668 669 670 671 672 673
  asm volatile (
    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
  "1:                                          \n"
    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
    MEMACCESS(0)
    "vst1.8    {q0}, [%0]!                     \n"  // store
    "bgt       1b                              \n"
  : "+r"(dst),   // %0
    "+r"(count)  // %1
  : "r"(v32)     // %2
  : "cc", "memory", "q0"
  );
674 675
}

676
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
677
  asm volatile (
678 679 680 681 682
    // Start at end of source row.
    "mov        r3, #-16                       \n"
    "add        %0, %0, %2                     \n"
    "sub        %0, #16                        \n"

683
  "1:                                          \n"
684
    MEMACCESS(0)
685 686 687
    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
    "subs       %2, #16                        \n"  // 16 pixels per loop.
    "vrev64.8   q0, q0                         \n"
688
    MEMACCESS(1)
689
    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
690
    MEMACCESS(1)
691 692 693 694 695 696
    "vst1.8     {d0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
697
  : "cc", "memory", "r3", "q0"
698 699 700
  );
}

Frank Barchard's avatar
Frank Barchard committed
701 702 703
void MirrorUVRow_NEON(const uint8* src_uv,
                      uint8* dst_u,
                      uint8* dst_v,
704
                      int width) {
705
  asm volatile (
706
    // Start at end of source row.
707
    "mov        r12, #-16                      \n"
708 709 710
    "add        %0, %0, %3, lsl #1             \n"
    "sub        %0, #16                        \n"

711
  "1:                                          \n"
712
    MEMACCESS(0)
713
    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
714 715
    "subs       %3, #8                         \n"  // 8 pixels per loop.
    "vrev64.8   q0, q0                         \n"
716
    MEMACCESS(1)
717
    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
718
    MEMACCESS(2)
719 720 721 722 723 724 725
    "vst1.8     {d1}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
726
  : "cc", "memory", "r12", "q0"
727 728
  );
}
729 730 731 732 733 734 735 736 737

void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov        r3, #-16                       \n"
    "add        %0, %0, %2, lsl #2             \n"
    "sub        %0, #16                        \n"

  "1:                                          \n"
738
    MEMACCESS(0)
739 740 741
    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
    "subs       %2, #4                         \n"  // 4 pixels per loop.
    "vrev64.32  q0, q0                         \n"
742
    MEMACCESS(1)
743
    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
744
    MEMACCESS(1)
745 746 747 748 749 750
    "vst1.8     {d0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
751
  : "cc", "memory", "r3", "q0"
752 753
  );
}
754

755
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
756
  asm volatile (
757
    "vmov.u8    d4, #255                       \n"  // Alpha
758
  "1:                                          \n"
759
    MEMACCESS(0)
760
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
761
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
762
    MEMACCESS(1)
763
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
764 765
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
766
    "+r"(dst_argb),   // %1
767
    "+r"(width)         // %2
768
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
769
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
770 771 772
  );
}

773
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
774
  asm volatile (
775
    "vmov.u8    d4, #255                       \n"  // Alpha
776
  "1:                                          \n"
777
    MEMACCESS(0)
778
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
779
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
780
    "vswp.u8    d1, d3                         \n"  // swap R, B
781
    MEMACCESS(1)
782
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
783
    "bgt        1b                             \n"
784
  : "+r"(src_raw),   // %0
785
    "+r"(dst_argb),  // %1
786
    "+r"(width)      // %2
787
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
788
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
789 790 791
  );
}

792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
// Converts a row of 3-byte RAW pixels to 3-byte RGB24 pixels by exchanging
// the first and third byte of every pixel; the middle byte is unchanged.
// Handles 8 pixels (24 bytes) per iteration; the loop always runs at least
// once, so width is effectively rounded up to a multiple of 8.
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vswp.u8    d1, d3                         \n"  // swap R, B
    MEMACCESS(1)
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
    "bgt        1b                             \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}

Frank Barchard's avatar
Frank Barchard committed
810 811 812 813 814 815 816 817 818 819 820
// Asm fragment: expands 8 RGB565 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels, replicating the top bits of each field into the
// low bits. Results: d0 = B, d1 = G, d2 = R. d3 is not written, so the caller
// supplies alpha there. Uses d4, d5 and d6 as scratch.
#define RGB565TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                    */
821

822
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
823 824 825
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
  "1:                                          \n"
826
    MEMACCESS(0)
827
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
828 829
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    RGB565TOARGB
830
    MEMACCESS(1)
831 832 833 834
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_argb),    // %1
835
    "+r"(width)          // %2
836
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
837
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
838 839 840
  );
}

Frank Barchard's avatar
Frank Barchard committed
841 842 843 844 845 846 847 848 849 850 851 852 853
// Asm fragment: expands 8 ARGB1555 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels. The 5-bit colour fields have their top 3 bits
// replicated into the low bits; the single alpha bit becomes 0x00 or 0xFF.
// Results: d0 = B, d1 = G, d2 = R, d3 = A. Uses q2 and q3 as scratch.
#define ARGB1555TOARGB                                                      \
  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
854

855
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
Frank Barchard's avatar
Frank Barchard committed
856 857 858 859 860 861 862 863 864 865 866 867 868 869
// Asm fragment: expands the three 5-bit colour fields of 8 16-bit pixels held
// in q0 into 8-bit channels, replicating the top 3 bits of each field into
// the low bits. Results: d0 = B, d1 = G, d2 = R. d3 is not written, so the
// caller supplies alpha there. Uses d4, d5 and d6 as scratch.
#define RGB555TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                    */

void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
                            uint8* dst_argb,
870
                            int width) {
871 872 873
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
  "1:                                          \n"
874
    MEMACCESS(0)
875 876 877
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB1555TOARGB
878
    MEMACCESS(1)
879 880 881 882
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_argb),    // %1
883
    "+r"(width)          // %2
884
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
885
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
886 887 888
  );
}

Frank Barchard's avatar
Frank Barchard committed
889 890 891 892 893 894 895 896 897
// Asm fragment: expands 8 ARGB4444 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels by duplicating each 4-bit field into both
// nibbles of its output byte.
// Results: d0 = B, d1 = G, d2 = R, d3 = A. Uses q2 as scratch.
#define ARGB4444TOARGB                                                      \
  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
898

Frank Barchard's avatar
Frank Barchard committed
899 900
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
                            uint8* dst_argb,
901
                            int width) {
902 903 904
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
  "1:                                          \n"
905
    MEMACCESS(0)
906 907 908
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB4444TOARGB
909
    MEMACCESS(1)
910 911 912 913
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_argb),    // %1
914
    "+r"(width)          // %2
915
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
916
  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
917 918 919
  );
}

920
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
921 922
  asm volatile (
  "1:                                          \n"
923
    MEMACCESS(0)
924
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
925
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
926
    MEMACCESS(1)
927
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
928 929 930
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_rgb24),  // %1
931
    "+r"(width)         // %2
932
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
933
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
934 935 936
  );
}

937
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
938 939
  asm volatile (
  "1:                                          \n"
940
    MEMACCESS(0)
941
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
942
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
943
    "vswp.u8    d1, d3                         \n"  // swap R, B
944
    MEMACCESS(1)
945
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
946 947 948
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_raw),   // %1
949
    "+r"(width)        // %2
950
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
951
  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
952 953 954
  );
}

955
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
956 957
  asm volatile (
  "1:                                          \n"
958
    MEMACCESS(0)
959
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
960
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
961
    MEMACCESS(1)
962
    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
963 964 965
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
966
    "+r"(width)        // %2
967
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
968
  : "cc", "memory", "q0", "q1"  // Clobber List
969 970 971
  );
}

972
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
973 974
  asm volatile (
  "1:                                          \n"
975
    MEMACCESS(0)
976
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
977
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
978
    MEMACCESS(1)
979
    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
980 981 982
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
983
    "+r"(width)        // %2
984
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
985
  : "cc", "memory", "q0", "q1"  // Clobber List
986 987 988
  );
}

Frank Barchard's avatar
Frank Barchard committed
989 990 991
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
992
                         int width) {
993 994
  asm volatile (
  "1:                                          \n"
995
    MEMACCESS(0)
996
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
997
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
998
    MEMACCESS(1)
999
    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
1000
    MEMACCESS(2)
1001
    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
1002 1003 1004 1005
    "bgt        1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
1006
    "+r"(width)        // %3
1007
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1008
  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1009 1010 1011
  );
}

Frank Barchard's avatar
Frank Barchard committed
1012 1013 1014
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
1015
                         int width) {
1016 1017
  asm volatile (
  "1:                                          \n"
1018
    MEMACCESS(0)
1019
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1020
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
1021
    MEMACCESS(1)
1022
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
1023
    MEMACCESS(2)
1024
    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
1025 1026 1027 1028
    "bgt        1b                             \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
1029
    "+r"(width)        // %3
1030
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1031
  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
1032 1033 1034
  );
}

Frank Barchard's avatar
Frank Barchard committed
1035 1036 1037 1038 1039
void YUY2ToUVRow_NEON(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
1040
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1041
    "add        %1, %0, %1                     \n"  // stride + src_yuy2
1042
  "1:                                          \n"
1043
    MEMACCESS(0)
1044
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
1045
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1046
    MEMACCESS(1)
1047 1048 1049
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
1050
    MEMACCESS(2)
1051
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
1052
    MEMACCESS(3)
1053
    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
1054
    "bgt        1b                             \n"
1055
  : "+r"(src_yuy2),     // %0
1056
    "+r"(stride_yuy2),  // %1
1057 1058
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
1059
    "+r"(width)           // %4
1060
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1061
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1062 1063 1064
  );
}

Frank Barchard's avatar
Frank Barchard committed
1065 1066 1067 1068 1069
void UYVYToUVRow_NEON(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
1070
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1071
    "add        %1, %0, %1                     \n"  // stride + src_uyvy
1072
  "1:                                          \n"
1073
    MEMACCESS(0)
1074
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
1075
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
1076
    MEMACCESS(1)
1077 1078 1079
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
1080
    MEMACCESS(2)
1081
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
1082
    MEMACCESS(3)
1083
    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
1084
    "bgt        1b                             \n"
1085
  : "+r"(src_uyvy),     // %0
1086
    "+r"(stride_uyvy),  // %1
1087 1088
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
1089
    "+r"(width)           // %4
1090
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1091
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
1092 1093
  );
}
1094

fbarchard@google.com's avatar
fbarchard@google.com committed
1095
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
Frank Barchard's avatar
Frank Barchard committed
1096 1097 1098 1099
void ARGBShuffleRow_NEON(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1100
  asm volatile (
1101
    MEMACCESS(3)
1102
    "vld1.8     {q2}, [%3]                     \n"  // shuffler
fbarchard@google.com's avatar
fbarchard@google.com committed
1103
  "1:                                          \n"
1104
    MEMACCESS(0)
1105
    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1106 1107 1108
    "subs       %2, %2, #4                     \n"  // 4 processed per loop
    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
1109
    MEMACCESS(1)
1110
    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
fbarchard@google.com's avatar
fbarchard@google.com committed
1111 1112 1113
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
1114
    "+r"(width)        // %2
fbarchard@google.com's avatar
fbarchard@google.com committed
1115 1116
  : "r"(shuffler)    // %3
  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
1117
  );
1118 1119
}

fbarchard@google.com's avatar
fbarchard@google.com committed
1120 1121 1122
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
Frank Barchard's avatar
Frank Barchard committed
1123 1124
                        uint8* dst_yuy2,
                        int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1125 1126
  asm volatile (
  "1:                                          \n"
1127
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1128
    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
1129
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1130
    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
1131
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1132 1133
    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
    "subs       %4, %4, #16                    \n"  // 16 pixels
1134
    MEMACCESS(3)
1135
    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1136
    "bgt        1b                             \n"
1137 1138 1139 1140 1141 1142 1143
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_yuy2),  // %3
    "+r"(width)      // %4
  :
  : "cc", "memory", "d0", "d1", "d2", "d3"
fbarchard@google.com's avatar
fbarchard@google.com committed
1144 1145 1146 1147 1148 1149
  );
}

void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
Frank Barchard's avatar
Frank Barchard committed
1150 1151
                        uint8* dst_uyvy,
                        int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1152 1153
  asm volatile (
  "1:                                          \n"
1154
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1155
    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
1156
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1157
    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
1158
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1159 1160
    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
    "subs       %4, %4, #16                    \n"  // 16 pixels
1161
    MEMACCESS(3)
1162
    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
fbarchard@google.com's avatar
fbarchard@google.com committed
1163
    "bgt        1b                             \n"
1164 1165 1166 1167 1168 1169 1170
  : "+r"(src_y),     // %0
    "+r"(src_u),     // %1
    "+r"(src_v),     // %2
    "+r"(dst_uyvy),  // %3
    "+r"(width)      // %4
  :
  : "cc", "memory", "d0", "d1", "d2", "d3"
fbarchard@google.com's avatar
fbarchard@google.com committed
1171 1172 1173
  );
}

1174
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1175 1176
  asm volatile (
  "1:                                          \n"
1177
    MEMACCESS(0)
1178
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1179
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1180
    ARGBTORGB565
1181
    MEMACCESS(1)
1182 1183 1184 1185
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb565),  // %1
1186
    "+r"(width)        // %2
1187
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1188
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1189 1190 1191
  );
}

Frank Barchard's avatar
Frank Barchard committed
1192 1193 1194 1195
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
                                uint8* dst_rgb,
                                const uint32 dither4,
                                int width) {
1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216
  asm volatile (
    "vdup.32    d2, %2                         \n"  // dither4
  "1:                                          \n"
    MEMACCESS(1)
    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   d20, d20, d2                   \n"
    "vqadd.u8   d21, d21, d2                   \n"
    "vqadd.u8   d22, d22, d2                   \n"
    ARGBTORGB565
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
  : "+r"(dst_rgb)    // %0
  : "r"(src_argb),   // %1
    "r"(dither4),    // %2
    "r"(width)       // %3
  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1217 1218
void ARGBToARGB1555Row_NEON(const uint8* src_argb,
                            uint8* dst_argb1555,
1219
                            int width) {
1220 1221
  asm volatile (
  "1:                                          \n"
1222
    MEMACCESS(0)
1223
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1224
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1225
    ARGBTOARGB1555
1226
    MEMACCESS(1)
1227 1228 1229 1230
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb1555),  // %1
1231
    "+r"(width)        // %2
1232
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1233
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1234 1235 1236
  );
}

Frank Barchard's avatar
Frank Barchard committed
1237 1238
void ARGBToARGB4444Row_NEON(const uint8* src_argb,
                            uint8* dst_argb4444,
1239
                            int width) {
1240 1241 1242
  asm volatile (
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
  "1:                                          \n"
1243
    MEMACCESS(0)
1244
    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
1245
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
1246
    ARGBTOARGB4444
1247
    MEMACCESS(1)
1248 1249
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
1250
  : "+r"(src_argb),      // %0
1251
    "+r"(dst_argb4444),  // %1
1252
    "+r"(width)            // %2
1253
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1254
  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
1255 1256
  );
}
1257

1258
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1259 1260 1261 1262 1263 1264
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
  "1:                                          \n"
1265
    MEMACCESS(0)
1266
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
1267 1268 1269 1270 1271 1272
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
1273
    MEMACCESS(1)
1274 1275 1276 1277
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
1278
    "+r"(width)        // %2
1279
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1280
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
1281 1282 1283
  );
}

1284 1285 1286 1287
// Copies the fourth byte (alpha) of every 4-byte ARGB pixel to dst_a.
// Two de-interleaving loads place the alpha bytes of 16 consecutive pixels in
// d6 and d7, which are then stored together as q3.
// Handles 16 pixels per iteration; the loop always runs at least once, so
// width is effectively rounded up to a multiple of 16.
//
// Both loads carry a MEMACCESS(0) annotation, matching every other load and
// store in this file.
void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
    "bgt       1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_a),      // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

1302
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1303
  asm volatile (
1304 1305 1306
    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
1307
  "1:                                          \n"
1308
    MEMACCESS(0)
1309 1310 1311 1312 1313
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
1314
    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
1315
    MEMACCESS(1)
1316 1317 1318 1319
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
1320
    "+r"(width)        // %2
1321 1322 1323 1324 1325
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

1326
// 8x1 pixels.
Frank Barchard's avatar
Frank Barchard committed
1327 1328 1329
void ARGBToUV444Row_NEON(const uint8* src_argb,
                         uint8* dst_u,
                         uint8* dst_v,
1330
                         int width) {
1331
  asm volatile (
1332 1333 1334 1335 1336 1337
    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1338
  "1:                                          \n"
1339
    MEMACCESS(0)
1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlsl.u8   q2, d1, d25                    \n"  // G
    "vmlsl.u8   q2, d2, d26                    \n"  // R
    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned

    "vmull.u8   q3, d2, d24                    \n"  // R
    "vmlsl.u8   q3, d1, d28                    \n"  // G
    "vmlsl.u8   q3, d0, d27                    \n"  // B
    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned

1352 1353
    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
1354

1355
    MEMACCESS(1)
1356
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
1357
    MEMACCESS(2)
1358
    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
1359 1360 1361 1362
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
1363
    "+r"(width)        // %3
1364
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1365
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
1366 1367
  );
}
1368

1369
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386
// Emits the shared U/V arithmetic used by the subsampling *ToUVRow_NEON
// functions. QB, QG and QR name the q registers holding 8 x 16 bit B, G and R
// values. The expanding function must have loaded q10..q14 with the
// coefficients (q10 = UB/VR, q11 = UG, q12 = UR, q13 = VB, q14 = VG) and q15
// with the bias. Results: 8 U bytes in d0 and 8 V bytes in d1; q8 and q9 are
// used as scratch.
#define RGBTOUV(QB, QG, QR)                                                 \
  /* U = B * q10 - G * q11 - R * q12, then + bias */                        \
  "vmul.s16   q8, " #QB ", q10               \n"                            \
  "vmls.s16   q8, " #QG ", q11               \n"                            \
  "vmls.s16   q8, " #QR ", q12               \n"                            \
  "vadd.u16   q8, q8, q15                    \n"                            \
  /* V = R * q10 - G * q14 - B * q13, then + bias */                        \
  "vmul.s16   q9, " #QR ", q10               \n"                            \
  "vmls.s16   q9, " #QG ", q14               \n"                            \
  "vmls.s16   q9, " #QB ", q13               \n"                            \
  "vadd.u16   q9, q9, q15                    \n"                            \
  /* narrow 16 bit sums to 8 bit: U in d0, V in d1 */                       \
  "vqshrn.u16  d0, q8, #8                    \n"                            \
  "vqshrn.u16  d1, q9, #8                    \n"
1387

1388
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
Frank Barchard's avatar
Frank Barchard committed
1389 1390 1391 1392 1393
void ARGBToUVRow_NEON(const uint8* src_argb,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1394
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1395
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1396 1397 1398 1399 1400
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1401 1402
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1403
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1404
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1405
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1406 1407 1408 1409
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1410
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1411
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1412
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1413 1414 1415 1416
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1417 1418 1419 1420 1421

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
1422
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1423
    RGBTOUV(q0, q1, q2)
1424
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1425
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1426
    MEMACCESS(3)
fbarchard@google.com's avatar
fbarchard@google.com committed
1427
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
fbarchard@google.com's avatar
fbarchard@google.com committed
1428 1429
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
fbarchard@google.com's avatar
fbarchard@google.com committed
1430 1431 1432
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1433
    "+r"(width)        // %4
fbarchard@google.com's avatar
fbarchard@google.com committed
1434
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1435
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
1436 1437 1438
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1439

1440
// TODO(fbarchard): Subsample match C code.
Frank Barchard's avatar
Frank Barchard committed
1441 1442 1443 1444 1445
void ARGBToUVJRow_NEON(const uint8* src_argb,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
1446 1447
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1448 1449 1450 1451 1452
    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1453 1454
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1455
    MEMACCESS(0)
1456
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
1457
    MEMACCESS(0)
1458 1459 1460 1461
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1462
    MEMACCESS(1)
1463
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
1464
    MEMACCESS(1)
1465 1466 1467 1468
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1469 1470 1471 1472 1473

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1474 1475
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1476
    MEMACCESS(2)
1477
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1478
    MEMACCESS(3)
1479 1480 1481 1482 1483 1484
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1485
    "+r"(width)        // %4
1486 1487 1488 1489 1490 1491
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1492 1493 1494 1495 1496
void BGRAToUVRow_NEON(const uint8* src_bgra,
                      int src_stride_bgra,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
1497 1498
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
1499 1500 1501 1502 1503
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1504 1505
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1506
    MEMACCESS(0)
1507
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
1508
    MEMACCESS(0)
1509 1510 1511 1512
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
1513
    MEMACCESS(1)
1514
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
1515
    MEMACCESS(1)
1516 1517 1518 1519
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
1520 1521 1522 1523 1524

    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
    "vrshr.u16  q2, q2, #1                     \n"
    "vrshr.u16  q3, q3, #1                     \n"

1525 1526
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q3, q2, q1)
1527
    MEMACCESS(2)
1528
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1529
    MEMACCESS(3)
1530 1531 1532 1533 1534 1535
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_stride_bgra),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1536
    "+r"(width)        // %4
1537
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1538
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1539 1540 1541 1542
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1543 1544 1545 1546 1547
void ABGRToUVRow_NEON(const uint8* src_abgr,
                      int src_stride_abgr,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
1548 1549
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
1550 1551 1552 1553 1554
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1555 1556
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1557
    MEMACCESS(0)
1558
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
1559
    MEMACCESS(0)
1560 1561 1562 1563
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1564
    MEMACCESS(1)
1565
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
1566
    MEMACCESS(1)
1567 1568 1569 1570
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1571 1572 1573 1574 1575

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1576 1577
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
1578
    MEMACCESS(2)
1579
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1580
    MEMACCESS(3)
1581 1582 1583 1584 1585 1586
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_stride_abgr),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1587
    "+r"(width)        // %4
1588
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1589
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1590 1591 1592 1593
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1594 1595 1596 1597 1598
void RGBAToUVRow_NEON(const uint8* src_rgba,
                      int src_stride_rgba,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
1599 1600
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
1601 1602 1603 1604 1605
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1606 1607
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1608
    MEMACCESS(0)
1609
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
1610
    MEMACCESS(0)
1611 1612 1613 1614
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
1615
    MEMACCESS(1)
1616
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
1617
    MEMACCESS(1)
1618 1619 1620 1621
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
1622 1623 1624 1625 1626

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1627 1628
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1629
    MEMACCESS(2)
1630
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1631
    MEMACCESS(3)
1632 1633 1634 1635 1636 1637
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_stride_rgba),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1638
    "+r"(width)        // %4
1639
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1640
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1641 1642 1643 1644
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1645 1646 1647 1648 1649
void RGB24ToUVRow_NEON(const uint8* src_rgb24,
                       int src_stride_rgb24,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
1650 1651
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
1652 1653 1654 1655 1656
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1657 1658
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1659
    MEMACCESS(0)
1660
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
1661
    MEMACCESS(0)
1662 1663 1664 1665
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
1666
    MEMACCESS(1)
1667
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
1668
    MEMACCESS(1)
1669 1670 1671 1672
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1673 1674 1675 1676 1677

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1678 1679
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
1680
    MEMACCESS(2)
1681
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1682
    MEMACCESS(3)
1683 1684 1685 1686 1687 1688
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_stride_rgb24),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1689
    "+r"(width)        // %4
1690
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1691
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1692 1693 1694 1695
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1696 1697 1698 1699 1700
void RAWToUVRow_NEON(const uint8* src_raw,
                     int src_stride_raw,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
1701 1702
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_raw
1703 1704 1705 1706 1707
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1708 1709
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1710
    MEMACCESS(0)
1711
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
1712
    MEMACCESS(0)
1713 1714 1715 1716
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
1717
    MEMACCESS(1)
1718
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
1719
    MEMACCESS(1)
1720 1721 1722 1723
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1724 1725 1726 1727 1728

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1729 1730
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
1731
    MEMACCESS(2)
1732
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1733
    MEMACCESS(3)
1734 1735 1736 1737 1738 1739
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(src_stride_raw),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1740
    "+r"(width)        // %4
1741
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1742
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1743 1744 1745 1746
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

1747
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1748 1749 1750 1751 1752
void RGB565ToUVRow_NEON(const uint8* src_rgb565,
                        int src_stride_rgb565,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1753 1754
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1755 1756 1757 1758 1759
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1760 1761
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1762
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1763 1764 1765 1766 1767
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1768
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
1769 1770 1771 1772 1773 1774
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1775
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1776 1777 1778 1779 1780
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1781
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
1782 1783 1784 1785 1786 1787
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1788 1789 1790 1791
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1803
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1804
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1805
    MEMACCESS(3)
fbarchard@google.com's avatar
fbarchard@google.com committed
1806 1807 1808 1809 1810 1811
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(src_stride_rgb565),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1812
    "+r"(width)        // %4
fbarchard@google.com's avatar
fbarchard@google.com committed
1813
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1814
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
1815 1816 1817
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1818

1819
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1820 1821 1822 1823 1824
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
                          int src_stride_argb1555,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
1825 1826
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1827 1828 1829 1830 1831
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1832 1833
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1834
    MEMACCESS(0)
1835 1836 1837 1838 1839
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1840
    MEMACCESS(0)
1841 1842 1843 1844 1845 1846
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1847
    MEMACCESS(1)
1848 1849 1850 1851 1852
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1853
    MEMACCESS(1)
1854 1855 1856 1857 1858 1859
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1860 1861 1862 1863
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1875
    MEMACCESS(2)
1876
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1877
    MEMACCESS(3)
1878 1879 1880 1881 1882 1883
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(src_stride_argb1555),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1884
    "+r"(width)        // %4
1885
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1886
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1887 1888 1889 1890
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

1891
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1892 1893 1894 1895 1896
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
                          int src_stride_argb4444,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
1897 1898
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1899 1900 1901 1902 1903
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1904 1905
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
  "1:                                          \n"
1906
    MEMACCESS(0)
1907 1908 1909 1910 1911
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1912
    MEMACCESS(0)
1913 1914 1915 1916 1917 1918
    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1919
    MEMACCESS(1)
1920 1921 1922 1923 1924
    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
1925
    MEMACCESS(1)
1926 1927 1928 1929 1930 1931
    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

1932 1933 1934 1935
    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
    "vrshr.u16  q5, q5, #1                     \n"
    "vrshr.u16  q6, q6, #1                     \n"

1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946
    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
    "vmul.s16   q8, q4, q10                    \n"  // B
    "vmls.s16   q8, q5, q11                    \n"  // G
    "vmls.s16   q8, q6, q12                    \n"  // R
    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
    "vmul.s16   q9, q6, q10                    \n"  // R
    "vmls.s16   q9, q5, q14                    \n"  // G
    "vmls.s16   q9, q4, q13                    \n"  // B
    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
1947
    MEMACCESS(2)
1948
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
1949
    MEMACCESS(3)
1950 1951 1952 1953 1954 1955
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(src_stride_argb4444),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1956
    "+r"(width)        // %4
1957
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1958
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1959 1960 1961
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
fbarchard@google.com's avatar
fbarchard@google.com committed
1962

1963
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
1964 1965 1966 1967 1968 1969
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
  "1:                                          \n"
1970
    MEMACCESS(0)
1971
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
1972 1973 1974 1975 1976 1977 1978
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    RGB565TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
1979
    MEMACCESS(1)
1980 1981 1982 1983
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
1984
    "+r"(width)          // %2
1985
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1986
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
1987 1988 1989
  );
}

1990
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
1991 1992 1993 1994 1995 1996
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
  "1:                                          \n"
1997
    MEMACCESS(0)
1998 1999 2000 2001 2002 2003 2004 2005
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
2006
    MEMACCESS(1)
2007 2008 2009 2010
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
2011
    "+r"(width)            // %2
2012
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2013
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2014 2015 2016
  );
}

2017
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2018 2019 2020 2021 2022 2023
  asm volatile (
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
    "vmov.u8    d27, #16                       \n"  // Add 16 constant
  "1:                                          \n"
2024
    MEMACCESS(0)
2025 2026 2027 2028 2029 2030 2031 2032
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d27                        \n"
2033
    MEMACCESS(1)
2034 2035 2036 2037
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
2038
    "+r"(width)            // %2
2039
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2040
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
2041 2042
  );
}
2043

2044
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2045 2046 2047 2048 2049 2050
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
  "1:                                          \n"
2051
    MEMACCESS(0)
2052 2053 2054 2055 2056 2057 2058
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d1, d4                     \n"  // R
    "vmlal.u8   q8, d2, d5                     \n"  // G
    "vmlal.u8   q8, d3, d6                     \n"  // B
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2059
    MEMACCESS(1)
2060 2061 2062 2063
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
2064
    "+r"(width)        // %2
2065
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2066
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2067 2068 2069
  );
}

2070
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2071 2072 2073 2074 2075 2076
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
  "1:                                          \n"
2077
    MEMACCESS(0)
2078 2079 2080 2081 2082 2083 2084
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // R
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // B
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2085
    MEMACCESS(1)
2086 2087 2088 2089
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),  // %1
2090
    "+r"(width)        // %2
2091
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2092
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2093 2094 2095
  );
}

2096
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2097 2098 2099 2100 2101 2102
  asm volatile (
    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
  "1:                                          \n"
2103
    MEMACCESS(0)
2104 2105 2106 2107 2108 2109 2110
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d1, d4                     \n"  // B
    "vmlal.u8   q8, d2, d5                     \n"  // G
    "vmlal.u8   q8, d3, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2111
    MEMACCESS(1)
2112 2113 2114 2115
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),  // %1
2116
    "+r"(width)        // %2
2117
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2118
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2119 2120 2121
  );
}

2122
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2123 2124 2125 2126 2127 2128
  asm volatile (
    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
  "1:                                          \n"
2129
    MEMACCESS(0)
2130
    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
2131 2132 2133 2134 2135 2136
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // B
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2137
    MEMACCESS(1)
2138 2139
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
2140
  : "+r"(src_rgb24),  // %0
2141
    "+r"(dst_y),  // %1
2142
    "+r"(width)        // %2
2143
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2144
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2145 2146
  );
}
2147

2148
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2149 2150 2151 2152 2153 2154
  asm volatile (
    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
  "1:                                          \n"
2155
    MEMACCESS(0)
2156 2157 2158 2159 2160 2161 2162
    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // B
    "vmlal.u8   q8, d1, d5                     \n"  // G
    "vmlal.u8   q8, d2, d6                     \n"  // R
    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
    "vqadd.u8   d0, d7                         \n"
2163
    MEMACCESS(1)
2164 2165 2166 2167
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),  // %1
2168
    "+r"(width)        // %2
2169
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2170
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
2171 2172
  );
}
2173

2174 2175
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
Frank Barchard's avatar
Frank Barchard committed
2176 2177 2178 2179
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
2180
  int y1_fraction = source_y_fraction;
2181
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
2182 2183 2184
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    "add        %2, %1                         \n"
2185
    "cmp        %4, #128                       \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2186 2187 2188
    "beq        50f                            \n"

    "vdup.8     d5, %4                         \n"
2189
    "rsb        %4, #256                       \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2190
    "vdup.8     d4, %4                         \n"
2191 2192
    // General purpose row blend.
  "1:                                          \n"
2193
    MEMACCESS(1)
2194
    "vld1.8     {q0}, [%1]!                    \n"
2195
    MEMACCESS(2)
2196
    "vld1.8     {q1}, [%2]!                    \n"
2197
    "subs       %3, %3, #16                    \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2198 2199 2200 2201
    "vmull.u8   q13, d0, d4                    \n"
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
2202 2203
    "vrshrn.u16 d0, q13, #8                    \n"
    "vrshrn.u16 d1, q14, #8                    \n"
2204
    MEMACCESS(0)
2205
    "vst1.8     {q0}, [%0]!                    \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2206 2207
    "bgt        1b                             \n"
    "b          99f                            \n"
2208 2209 2210

    // Blend 50 / 50.
  "50:                                         \n"
2211
    MEMACCESS(1)
2212
    "vld1.8     {q0}, [%1]!                    \n"
2213
    MEMACCESS(2)
2214
    "vld1.8     {q1}, [%2]!                    \n"
2215
    "subs       %3, %3, #16                    \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2216
    "vrhadd.u8  q0, q1                         \n"
2217
    MEMACCESS(0)
2218
    "vst1.8     {q0}, [%0]!                    \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2219 2220
    "bgt        50b                            \n"
    "b          99f                            \n"
2221 2222 2223

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
2224
    MEMACCESS(1)
2225
    "vld1.8     {q0}, [%1]!                    \n"
2226
    "subs       %3, %3, #16                    \n"
2227
    MEMACCESS(0)
2228
    "vst1.8     {q0}, [%0]!                    \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
2229
    "bgt        100b                           \n"
2230 2231 2232 2233 2234 2235

  "99:                                         \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
2236
    "+r"(y1_fraction)       // %4
2237
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
2238
  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
2239 2240
  );
}
fbarchard@google.com's avatar
fbarchard@google.com committed
2241 2242

// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
Frank Barchard's avatar
Frank Barchard committed
2243 2244 2245 2246
void ARGBBlendRow_NEON(const uint8* src_argb0,
                       const uint8* src_argb1,
                       uint8* dst_argb,
                       int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
2247 2248 2249 2250 2251
  asm volatile (
    "subs       %3, #8                         \n"
    "blt        89f                            \n"
    // Blend 8 pixels.
  "8:                                          \n"
2252
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2253
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
2254
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
2255 2256
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
2257 2258 2259 2260 2261 2262 2263 2264 2265 2266
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
fbarchard@google.com's avatar
fbarchard@google.com committed
2267
    "vmov.u8    d3, #255                       \n"  // a = 255
2268
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
2269 2270 2271 2272 2273 2274 2275 2276 2277
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
    "bge        8b                             \n"

  "89:                                         \n"
    "adds       %3, #8-1                       \n"
    "blt        99f                            \n"

    // Blend 1 pixels.
  "1:                                          \n"
2278
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2279
    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
2280
    MEMACCESS(1)
fbarchard@google.com's avatar
fbarchard@google.com committed
2281 2282
    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
2283 2284 2285 2286 2287 2288 2289 2290 2291 2292
    "vmull.u8   q10, d4, d3                    \n"  // db * a
    "vmull.u8   q11, d5, d3                    \n"  // dg * a
    "vmull.u8   q12, d6, d3                    \n"  // dr * a
    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
    "vqadd.u8   d2, d2, d6                     \n"  // + sr
fbarchard@google.com's avatar
fbarchard@google.com committed
2293
    "vmov.u8    d3, #255                       \n"  // a = 255
2294
    MEMACCESS(2)
fbarchard@google.com's avatar
fbarchard@google.com committed
2295 2296 2297 2298 2299 2300 2301 2302 2303 2304
    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
    "bge        1b                             \n"

  "99:                                         \n"

  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
2305
  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
fbarchard@google.com's avatar
fbarchard@google.com committed
2306 2307 2308
  );
}

2309 2310 2311 2312 2313
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1:                                          \n"
2314
    MEMACCESS(0)
2315 2316 2317 2318 2319 2320 2321 2322
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q10, d0, d3                    \n"  // b * a
    "vmull.u8   q11, d1, d3                    \n"  // g * a
    "vmull.u8   q12, d2, d3                    \n"  // r * a
    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
2323
    MEMACCESS(1)
2324 2325 2326 2327 2328 2329 2330 2331 2332 2333
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2334 2335
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
Frank Barchard's avatar
Frank Barchard committed
2336 2337 2338 2339 2340
void ARGBQuantizeRow_NEON(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
2341 2342 2343 2344 2345 2346 2347 2348
  asm volatile (
    "vdup.u16   q8, %2                         \n"
    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
    "vdup.u16   q9, %3                         \n"  // interval multiply.
    "vdup.u16   q10, %4                        \n"  // interval add

    // 8 pixel loop.
  "1:                                          \n"
2349
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366
    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q2, d4                         \n"
    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
    "vqdmulh.s16 q1, q1, q8                    \n"  // g
    "vqdmulh.s16 q2, q2, q8                    \n"  // r
    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
    "vmul.u16   q1, q1, q9                     \n"  // g
    "vmul.u16   q2, q2, q9                     \n"  // r
    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
    "vadd.u16   q1, q1, q10                    \n"  // g
    "vadd.u16   q2, q2, q10                    \n"  // r
    "vqmovn.u16 d0, q0                         \n"
    "vqmovn.u16 d2, q1                         \n"
    "vqmovn.u16 d4, q2                         \n"
2367
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2368 2369 2370 2371 2372 2373 2374
    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
2375 2376 2377 2378 2379 2380
  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  );
}

// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2381
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
Frank Barchard's avatar
Frank Barchard committed
2382 2383 2384
void ARGBShadeRow_NEON(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
2385 2386 2387
                       uint32 value) {
  asm volatile (
    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
2388 2389
    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
2390 2391 2392

    // 8 pixel loop.
  "1:                                          \n"
2393
    MEMACCESS(0)
2394 2395 2396 2397 2398 2399
    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
    "vmovl.u8   q11, d22                       \n"
    "vmovl.u8   q12, d24                       \n"
    "vmovl.u8   q13, d26                       \n"
2400
    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
2401 2402 2403
    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
2404 2405 2406 2407
    "vqmovn.u16 d20, q10                       \n"
    "vqmovn.u16 d22, q11                       \n"
    "vqmovn.u16 d24, q12                       \n"
    "vqmovn.u16 d26, q13                       \n"
2408
    MEMACCESS(1)
2409 2410 2411 2412 2413 2414 2415
    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
fbarchard@google.com's avatar
fbarchard@google.com committed
2416 2417 2418
  );
}

2419
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2420 2421
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2422 2423
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
2424 2425 2426
    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
2427
  "1:                                          \n"
2428
    MEMACCESS(0)
2429 2430 2431 2432 2433
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d24                    \n"  // B
    "vmlal.u8   q2, d1, d25                    \n"  // G
    "vmlal.u8   q2, d2, d26                    \n"  // R
2434
    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
2435 2436
    "vmov       d1, d0                         \n"  // G
    "vmov       d2, d0                         \n"  // R
2437
    MEMACCESS(1)
2438 2439 2440 2441 2442 2443 2444 2445 2446 2447
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8    d20, #17                       \n"  // BB coefficient
    "vmov.u8    d21, #68                       \n"  // BG coefficient
    "vmov.u8    d22, #35                       \n"  // BR coefficient
    "vmov.u8    d24, #22                       \n"  // GB coefficient
    "vmov.u8    d25, #88                       \n"  // GG coefficient
    "vmov.u8    d26, #45                       \n"  // GR coefficient
    "vmov.u8    d28, #24                       \n"  // BB coefficient
    "vmov.u8    d29, #98                       \n"  // BG coefficient
    "vmov.u8    d30, #50                       \n"  // BR coefficient
  "1:                                          \n"
2464
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475
    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
    "vmlal.u8   q2, d1, d21                    \n"  // G
    "vmlal.u8   q2, d2, d22                    \n"  // R
    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
    "vmlal.u8   q3, d1, d25                    \n"  // G
    "vmlal.u8   q3, d2, d26                    \n"  // R
    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
    "vmlal.u8   q8, d1, d29                    \n"  // G
    "vmlal.u8   q8, d2, d30                    \n"  // R
2476 2477 2478
    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
2479
    MEMACCESS(0)
fbarchard@google.com's avatar
fbarchard@google.com committed
2480 2481 2482 2483 2484 2485 2486 2487 2488 2489
    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  : "cc", "memory", "q0", "q1", "q2", "q3",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

2490
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
2491 2492
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
Frank Barchard's avatar
Frank Barchard committed
2493 2494 2495 2496
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
                             uint8* dst_argb,
                             const int8* matrix_argb,
                             int width) {
2497
  asm volatile (
2498
    MEMACCESS(3)
2499
    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
2500
    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
2501
    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
2502 2503

  "1:                                          \n"
2504
    MEMACCESS(0)
2505 2506
    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
2507 2508 2509
    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
    "vmovl.u8   q9, d18                        \n"  // g
    "vmovl.u8   q10, d20                       \n"  // r
2510
    "vmovl.u8   q11, d22                       \n"  // a
2511 2512 2513
    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
2514
    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
2515 2516 2517
    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
2518
    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
2519 2520 2521
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2522
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2523 2524 2525
    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
2526
    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
2527 2528 2529
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2530
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
2531 2532 2533 2534
    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
2535 2536 2537
    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
2538 2539 2540 2541 2542
    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
2543
    MEMACCESS(1)
2544 2545 2546 2547 2548 2549
    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
2550
  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
2551
    "q10", "q11", "q12", "q13", "q14", "q15"
2552 2553 2554
  );
}

2555
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2556 2557 2558 2559
void ARGBMultiplyRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
2560 2561 2562
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
2563
    MEMACCESS(0)
2564
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
2565
    MEMACCESS(1)
2566 2567 2568 2569 2570 2571
    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q0, d0, d1                     \n"  // multiply B
    "vmull.u8   q1, d2, d3                     \n"  // multiply G
    "vmull.u8   q2, d4, d5                     \n"  // multiply R
    "vmull.u8   q3, d6, d7                     \n"  // multiply A
2572 2573 2574 2575
    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
2576
    MEMACCESS(2)
2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2590 2591 2592 2593
void ARGBAddRow_NEON(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
2594 2595 2596
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
2597
    MEMACCESS(0)
2598
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2599
    MEMACCESS(1)
2600 2601 2602 2603
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
2604
    MEMACCESS(2)
2605 2606 2607 2608 2609 2610 2611 2612
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
2613 2614 2615 2616 2617
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2618 2619 2620 2621
void ARGBSubtractRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
2622 2623 2624
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
2625
    MEMACCESS(0)
2626
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
2627
    MEMACCESS(1)
2628 2629 2630 2631
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
2632
    MEMACCESS(2)
2633 2634 2635 2636 2637 2638 2639 2640 2641
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
2642 2643 2644
  );
}

2645 2646 2647 2648 2649
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
Frank Barchard's avatar
Frank Barchard committed
2650 2651 2652 2653
void SobelRow_NEON(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
2654 2655 2656 2657
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
2658
    MEMACCESS(0)
2659
    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
2660
    MEMACCESS(1)
2661 2662 2663 2664 2665
    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   d0, d0, d1                     \n"  // add
    "vmov.u8    d1, d0                         \n"
    "vmov.u8    d2, d0                         \n"
2666
    MEMACCESS(2)
2667 2668 2669 2670
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
2671 2672 2673 2674 2675 2676 2677 2678
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "q0", "q1"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
Frank Barchard's avatar
Frank Barchard committed
2679 2680 2681 2682
void SobelToPlaneRow_NEON(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
2683 2684 2685
  asm volatile (
    // 16 pixel loop.
  "1:                                          \n"
2686
    MEMACCESS(0)
2687
    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
2688
    MEMACCESS(1)
2689 2690 2691
    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
    "vqadd.u8   q0, q0, q1                     \n"  // add
2692
    MEMACCESS(2)
2693 2694 2695 2696 2697 2698
    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
2699 2700 2701 2702 2703 2704 2705 2706 2707 2708
  :
  : "cc", "memory", "q0", "q1"
  );
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
Frank Barchard's avatar
Frank Barchard committed
2709 2710 2711 2712
void SobelXYRow_NEON(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
2713 2714 2715 2716
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
2717
    MEMACCESS(0)
2718
    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
2719
    MEMACCESS(1)
2720 2721 2722
    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vqadd.u8   d1, d0, d2                     \n"  // add
2723
    MEMACCESS(2)
2724 2725 2726 2727
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
fbarchard@google.com's avatar
fbarchard@google.com committed
2728 2729
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
2730 2731 2732 2733 2734 2735 2736 2737 2738
  :
  : "cc", "memory", "q0", "q1"
  );
}

// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
Frank Barchard's avatar
Frank Barchard committed
2739 2740 2741 2742 2743
void SobelXRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
2744 2745
  asm volatile (
  "1:                                          \n"
2746
    MEMACCESS(0)
2747
    "vld1.8     {d0}, [%0],%5                  \n"  // top
2748
    MEMACCESS(0)
2749
    "vld1.8     {d1}, [%0],%6                  \n"
2750
    "vsubl.u8   q0, d0, d1                     \n"
2751
    MEMACCESS(1)
2752
    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
2753
    MEMACCESS(1)
2754
    "vld1.8     {d3}, [%1],%6                  \n"
2755 2756 2757
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vadd.s16   q0, q0, q1                     \n"
2758
    MEMACCESS(2)
2759
    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
2760
    MEMACCESS(2)
2761
    "vld1.8     {d3}, [%2],%6                  \n"
2762 2763 2764 2765 2766
    "subs       %4, %4, #8                     \n"  // 8 pixels
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vabs.s16   q0, q0                         \n"
    "vqmovn.u16 d0, q0                         \n"
2767
    MEMACCESS(3)
2768
    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784
    "bgt        1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2),            // %5
    "r"(6)             // %6
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
Frank Barchard's avatar
Frank Barchard committed
2785 2786 2787 2788
void SobelYRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
2789 2790
  asm volatile (
  "1:                                          \n"
2791
    MEMACCESS(0)
2792
    "vld1.8     {d0}, [%0],%4                  \n"  // left
2793
    MEMACCESS(1)
2794
    "vld1.8     {d1}, [%1],%4                  \n"
2795
    "vsubl.u8   q0, d0, d1                     \n"
2796
    MEMACCESS(0)
2797
    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
2798
    MEMACCESS(1)
2799
    "vld1.8     {d3}, [%1],%4                  \n"
2800 2801 2802
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vadd.s16   q0, q0, q1                     \n"
2803
    MEMACCESS(0)
2804
    "vld1.8     {d2}, [%0],%5                  \n"  // right
2805
    MEMACCESS(1)
2806
    "vld1.8     {d3}, [%1],%5                  \n"
2807 2808 2809 2810 2811
    "subs       %3, %3, #8                     \n"  // 8 pixels
    "vsubl.u8   q1, d2, d3                     \n"
    "vadd.s16   q0, q0, q1                     \n"
    "vabs.s16   q0, q0                         \n"
    "vqmovn.u16 d0, q0                         \n"
2812
    MEMACCESS(2)
2813
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
2814 2815 2816 2817 2818 2819 2820 2821 2822 2823
    "bgt        1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  : "r"(1),            // %4
    "r"(6)             // %5
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
2824 2825 2826 2827 2828 2829 2830 2831 2832

// Convert 8 uint16 values per loop to 16 bit results using a fixed multiplier.
// Each value is widened to 32 bits, converted to float, multiplied by
// 1.9259299444e-34f (2^-112 to float precision), and the float's bit pattern
// is then shifted right by 13 and narrowed to 16 bits with saturation.
// The unnamed float parameter is not used.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "vdup.32    q0, %3                         \n"  // broadcast multiplier

  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
    "vmovl.u16  q2, d2                         \n"  // 8 int's
    "vmovl.u16  q3, d3                         \n"
    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
    "vcvt.f32.u32  q3, q3                      \n"
    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
    "vmul.f32   q3, q3, q0                     \n"
    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
    "vqshrn.u32 d3, q3, #13                    \n"
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store 8 results
    "bgt        1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(1.9259299444e-34f)    // %3
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}

// Convert 8 uint16 values per loop to 16 bit results, scaling by scale.
// Each value is widened to 32 bits, converted to float, multiplied by
// scale * 1.9259299444e-34f (the constant is 2^-112 to float precision), and
// the float's bit pattern is then shifted right by 13 and narrowed to 16 bits
// with saturation.
// TODO(fbarchard): multiply by element.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vdup.32    q0, %3                         \n"  // broadcast multiplier

  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
    "vmovl.u16  q2, d2                         \n"  // 8 int's
    "vmovl.u16  q3, d3                         \n"
    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
    "vcvt.f32.u32  q3, q3                      \n"
    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
    "vmul.f32   q3, q3, q0                     \n"
    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
    "vqshrn.u32 d3, q3, #13                    \n"
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store 8 results
    "bgt        1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(scale * 1.9259299444e-34f)    // %3
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif