row_neon.cc 127 KB
Newer Older
1
/*
2
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 4 5 6
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS. All contributing project authors may
8 9 10
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include "libyuv/row.h"
12

13 14
#include <stdio.h>

15 16 17 18 19
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

20
// This module is for GCC Neon
21 22
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
23

24
// Read 8 Y, 4 U and 4 V from 422
25 26 27 28
// Asm fragment: loads 8 bytes from [%0] into d0, then 4 bytes from [%1] into
// the low 32-bit lane of d2 and 4 bytes from [%2] into the high lane of d2.
// All three pointer operands are post-incremented by the amount loaded.
#define READYUV422                               \
  "vld1.8     {d0}, [%0]!                    \n" \
  "vld1.32    {d2[0]}, [%1]!                 \n" \
  "vld1.32    {d2[1]}, [%2]!                 \n"
29

30
// Read 8 Y, 8 U and 8 V from 444
31 32 33 34 35 36
// Asm fragment: loads 8 bytes each from [%0], [%1] and [%2] into d0, d2 and
// d3 (pointers post-incremented). vpaddl.u8 then sums adjacent byte pairs of
// q1 (d2:d3) and vrshrn.u16 #1 halves the sums with rounding, so d2 ends up
// holding 4 averaged values from [%1] followed by 4 averaged values from [%2],
// the same layout READYUV422 produces.
#define READYUV444                               \
  "vld1.8     {d0}, [%0]!                    \n" \
  "vld1.8     {d2}, [%1]!                    \n" \
  "vld1.8     {d3}, [%2]!                    \n" \
  "vpaddl.u8  q1, q1                         \n" \
  "vrshrn.u16 d2, q1, #1                     \n"
37

38
// Read 8 Y, and set 4 U and 4 V to 128
Frank Barchard's avatar
Frank Barchard committed
39 40 41
// Asm fragment: loads 8 bytes from [%0] into d0 (pointer post-incremented)
// and fills every byte of d2 with 128 in place of loaded chroma.
#define READYUV400                               \
  "vld1.8     {d0}, [%0]!                    \n" \
  "vmov.u8    d2, #128                       \n"
42

43
// Read 8 Y and 4 UV from NV12
44 45 46 47 48 49
// Asm fragment: loads 8 bytes from [%0] into d0 and 8 interleaved chroma bytes
// from [%1] into d2 (both pointers post-incremented). The copy + vuzp + vtrn
// sequence de-interleaves them so that d2 holds the 4 even-indexed bytes in
// its low half and the 4 odd-indexed bytes in its high half. d3 is scratch.
#define READNV12                                                               \
  "vld1.8     {d0}, [%0]!                    \n"                               \
  "vld1.8     {d2}, [%1]!                    \n"                               \
  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
  "vuzp.u8    d2, d3                         \n"                               \
  "vtrn.u32   d2, d3                         \n"
50 51

// Read 8 Y and 4 VU from NV21
52 53 54 55 56 57
// Asm fragment: same loads as READNV12, but the vuzp operands are swapped
// (d3, d2), so d2 ends up with the 4 odd-indexed chroma bytes in its low half
// and the 4 even-indexed bytes in its high half. d3 is scratch.
#define READNV21                                                               \
  "vld1.8     {d0}, [%0]!                    \n"                               \
  "vld1.8     {d2}, [%1]!                    \n"                               \
  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
  "vuzp.u8    d3, d2                         \n"                               \
  "vtrn.u32   d2, d3                         \n"
58 59

// Read 8 YUY2
Frank Barchard's avatar
Frank Barchard committed
60 61 62 63 64
// Asm fragment: vld2.8 de-interleaves 16 bytes from [%0] (post-incremented):
// even-indexed bytes go to d0 and odd-indexed bytes to d2. The odd bytes are
// then split again with the same copy + vuzp + vtrn sequence as READNV12, so
// d2 holds their even-indexed members in its low half and odd-indexed members
// in its high half. d3 is scratch.
#define READYUY2                                 \
  "vld2.8     {d0, d2}, [%0]!                \n" \
  "vmov.u8    d3, d2                         \n" \
  "vuzp.u8    d2, d3                         \n" \
  "vtrn.u32   d2, d3                         \n"
65 66

// Read 8 UYVY
Frank Barchard's avatar
Frank Barchard committed
67 68 69 70 71 72 73
// Asm fragment: vld2.8 de-interleaves 16 bytes from [%0] (post-incremented):
// even-indexed bytes go to d2 and odd-indexed bytes to d3. The odd bytes are
// moved to d0, and the even bytes are split with the same copy + vuzp + vtrn
// sequence as READNV12, leaving the result in d2. d3 is scratch.
#define READUYVY                                 \
  "vld2.8     {d2, d3}, [%0]!                \n" \
  "vmov.u8    d0, d3                         \n" \
  "vmov.u8    d3, d2                         \n" \
  "vuzp.u8    d2, d3                         \n" \
  "vtrn.u32   d2, d3                         \n"

74 75 76 77 78 79 80
// Asm fragment: loads the conversion constants used by YUVTORGB.
//   d24      <- 8 bytes at %[kUVToRB]
//   d25      <- 8 bytes at %[kUVToG]
//   q13      <- 1st 16-bit value at %[kUVBiasBGR], replicated to all lanes
//   q4       <- 2nd 16-bit value, replicated to all lanes
//   q14      <- 3rd 16-bit value, replicated to all lanes
//   q15      <- 32-bit value at %[kYToRgb], replicated to all lanes
// NOTE(review): the first two kUVBiasBGR loads post-increment the
// %[kUVBiasBGR] register, yet every user in this file binds it as an
// input-only "r" operand. GCC's extended-asm rules say input operands must
// not be modified -- confirm whether this needs an in/out operand or a
// scratch register.
#define YUVTORGB_SETUP                             \
  "vld1.8     {d24}, [%[kUVToRB]]            \n"   \
  "vld1.8     {d25}, [%[kUVToG]]             \n"   \
  "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
  "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" \
  "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" \
  "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
81

Frank Barchard's avatar
Frank Barchard committed
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
// Asm fragment: converts the 8 pixels prepared by one of the READ* macros.
// Inputs:  d0 = 8 Y bytes, d2 = 4 U bytes then 4 V bytes, plus the constants
//          loaded by YUVTORGB_SETUP (d24, d25, q4, q13, q14, q15).
// Outputs: d20 = B, d21 = G, d22 = R (8 unsigned bytes each).
// Steps:   Y is widened to 32 bits, multiplied by q15 and narrowed back with
//          a saturating >>16; the U/V products are duplicated into adjacent
//          16-bit lanes (vshll #16 + vaddw) so each chroma sample serves two
//          pixels; biases and chroma terms are combined with saturating
//          adds/subtracts; the result is narrowed with a saturating >>6.
// Scratch: q0, q1, q3, q8, q9, q10 are overwritten. d23 is not written here,
//          so callers may preset it before the loop.
#define YUVTORGB                                                              \
  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
  "vmovl.s16  q10, d1                        \n"                              \
  "vmovl.s16  q0, d0                         \n"                              \
  "vmul.s32   q10, q10, q15                  \n"                              \
  "vmul.s32   q0, q0, q15                    \n"                              \
  "vqshrun.s32 d0, q0, #16                   \n"                              \
  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
  "vadd.s16   d18, d19                       \n"                              \
  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
  "vaddw.u16  q1, q1, d16                    \n"                              \
  "vaddw.u16  q10, q10, d17                  \n"                              \
  "vaddw.u16  q3, q3, d18                    \n"                              \
  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
  "vqshrun.s16 d21, q0, #6                   \n" /* G */
108

Frank Barchard's avatar
Frank Barchard committed
109 110 111 112
// Converts a row of planar Y/U/V (one U and V byte per Y byte) to 4-byte
// pixels stored in memory as B, G, R, 255.
// Each loop iteration consumes 8 bytes from each source plane and writes 32
// bytes to dst_argb. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
// Presumably width is expected to be a positive multiple of 8 -- confirm
// against callers.
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
113
                        const struct YuvConstants* yuvconstants,
114
                        int width) {
115 116 117
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8    d23, #255                      \n"  // constant alpha
118
      "1:                                        \n" READYUV444 YUVTORGB
119 120 121 122 123 124 125 126 127 128 129 130 131 132
      "subs       %4, %4, #8                     \n"
      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // interleave B,G,R,A
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
133 134
}

Frank Barchard's avatar
Frank Barchard committed
135 136 137 138
// Converts a row of planar Y/U/V with half-width chroma to 4-byte pixels
// stored in memory as B, G, R, 255.
// Each loop iteration consumes 8 Y bytes plus 4 bytes each of U and V and
// writes 32 bytes to dst_argb. The loop body always runs at least once and
// repeats while width, decremented by 8 per pass, is still > 0.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
139
                        const struct YuvConstants* yuvconstants,
140
                        int width) {
141 142 143
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8    d23, #255                      \n"  // constant alpha
144
      "1:                                        \n" READYUV422 YUVTORGB
145 146 147 148 149 150 151 152 153 154 155 156 157 158
      "subs       %4, %4, #8                     \n"
      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // interleave B,G,R,A
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
159 160
}

Frank Barchard's avatar
Frank Barchard committed
161 162 163 164 165
// Same conversion as I422ToARGBRow_NEON, except the fourth byte of every
// output pixel is read from the src_a plane instead of being fixed at 255.
// Each loop iteration consumes 8 Y bytes, 4 U bytes, 4 V bytes and 8 alpha
// bytes, and writes 32 bytes to dst_argb. The loop body always runs at least
// once and repeats while width, decremented by 8 per pass, is still > 0.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
166 167
                             const struct YuvConstants* yuvconstants,
                             int width) {
168 169
  asm volatile(
      YUVTORGB_SETUP
170
      "1:                                        \n" READYUV422 YUVTORGB
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
      "subs       %5, %5, #8                     \n"
      "vld1.8     {d23}, [%3]!                   \n"  // 8 alpha bytes
      "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"  // interleave B,G,R,A
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
187 188
}

Frank Barchard's avatar
Frank Barchard committed
189 190 191 192
// Converts a row of planar Y/U/V with half-width chroma to 4-byte pixels
// stored in memory as 255, B, G, R (the constant byte comes first).
// d19 carries the 255 byte and has to be reloaded on every pass because
// YUVTORGB uses q9 (d18/d19) as scratch.
// Each loop iteration handles 8 pixels. The loop body always runs at least
// once and repeats while width, decremented by 8 per pass, is still > 0.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
193
                        const struct YuvConstants* yuvconstants,
194
                        int width) {
195 196
  asm volatile(
      YUVTORGB_SETUP
197
      "1:                                        \n" READYUV422 YUVTORGB
198
      "subs       %4, %4, #8                     \n"
199
      "vmov.u8    d19, #255                      \n"  // YUVTORGB modified d19
200 201 202 203 204 205 206 207 208 209 210 211 212
      "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_rgba),  // %3
        "+r"(width)      // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
213 214
}

Frank Barchard's avatar
Frank Barchard committed
215 216 217 218
// Converts a row of planar Y/U/V with half-width chroma to 3-byte pixels
// stored in memory as B, G, R.
// Each loop iteration handles 8 pixels and writes 24 bytes to dst_rgb24. The
// loop body always runs at least once and repeats while width, decremented
// by 8 per pass, is still > 0.
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
219
                         const struct YuvConstants* yuvconstants,
220
                         int width) {
221 222
  asm volatile(
      YUVTORGB_SETUP
223
      "1:                                        \n" READYUV422 YUVTORGB
224 225 226 227 228 229 230 231 232 233 234 235 236 237
      "subs       %4, %4, #8                     \n"
      "vst3.8     {d20, d21, d22}, [%3]!         \n"  // interleave B,G,R
      "bgt        1b                             \n"
      : "+r"(src_y),      // %0
        "+r"(src_u),      // %1
        "+r"(src_v),      // %2
        "+r"(dst_rgb24),  // %3
        "+r"(width)       // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
238 239
}

Frank Barchard's avatar
Frank Barchard committed
240 241 242 243 244 245
// Asm fragment: packs 8 pixels from d20 (B), d21 (G), d22 (R) into eight
// 16-bit values in q0. Each channel is first moved to the top byte of a
// 16-bit lane; vsri then shifts G in below the top 5 bits and B in below the
// top 11 bits, giving R in bits 15..11, G in bits 10..5 and B in bits 4..0.
// Overwrites q0, q8 and q9.
#define ARGBTORGB565                                                        \
  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
246

Frank Barchard's avatar
Frank Barchard committed
247 248 249 250
// Converts a row of planar Y/U/V with half-width chroma to 16-bit pixels
// packed by ARGBTORGB565.
// Each loop iteration handles 8 pixels and writes 16 bytes to dst_rgb565. The
// loop body always runs at least once and repeats while width, decremented
// by 8 per pass, is still > 0.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
251
                          const struct YuvConstants* yuvconstants,
252
                          int width) {
253 254
  asm volatile(
      YUVTORGB_SETUP
255 256
      "1:                                        \n" READYUV422 YUVTORGB
      "subs       %4, %4, #8                     \n" ARGBTORGB565
257 258 259 260 261 262 263 264 265 266 267 268 269
      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
      "bgt        1b                             \n"
      : "+r"(src_y),       // %0
        "+r"(src_u),       // %1
        "+r"(src_v),       // %2
        "+r"(dst_rgb565),  // %3
        "+r"(width)        // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
270 271
}

Frank Barchard's avatar
Frank Barchard committed
272 273 274 275 276 277 278 279
// Asm fragment: packs 8 pixels from d20 (B), d21 (G), d22 (R), d23 (A) into
// eight 16-bit values in q0. Each channel is first moved to the top byte of
// a 16-bit lane; successive vsri inserts leave A in bit 15, R in bits 14..10,
// G in bits 9..5 and B in bits 4..0.
// Overwrites q0, q8, q9 and q10 (so d20/d21 no longer hold B/G afterwards).
#define ARGBTOARGB1555                                                      \
  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
280

Frank Barchard's avatar
Frank Barchard committed
281 282 283 284
// Converts a row of planar Y/U/V with half-width chroma to 16-bit pixels
// packed by ARGBTOARGB1555, with the alpha input fixed at 255.
// Each loop iteration handles 8 pixels and writes 16 bytes to dst_argb1555.
// The loop body always runs at least once and repeats while width,
// decremented by 8 per pass, is still > 0.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
285
                            const struct YuvConstants* yuvconstants,
286
                            int width) {
287 288
  asm volatile(
      YUVTORGB_SETUP
289
      "1:                                        \n" READYUV422 YUVTORGB
290
      "subs       %4, %4, #8                     \n"
291
      "vmov.u8    d23, #255                      \n" ARGBTOARGB1555
292 293 294 295 296 297 298 299 300 301 302 303 304
      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
      "bgt        1b                             \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb1555),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
305 306
}

Frank Barchard's avatar
Frank Barchard committed
307 308 309 310 311 312 313 314
// Asm fragment: packs 8 pixels from d20 (B), d21 (G), d22 (R), d23 (A) into
// 2 bytes per pixel in q0, keeping the top 4 bits of each channel.
// Requires d4 to hold the bit mask that vbic clears (the caller in this file
// loads 0x0f into every byte). B and R are shifted down to the low nibble,
// G and A keep only their high nibble; the pairs are OR-ed and vzip
// interleaves them so each pixel is stored as byte (G:B) then byte (A:R).
// Overwrites d0, d1 and d20..d23.
#define ARGBTOARGB4444                                                      \
  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
  "vorr       d0, d20, d21                   \n" /* BG                   */ \
  "vorr       d1, d22, d23                   \n" /* RA                   */ \
  "vzip.u8    d0, d1                         \n" /* BGRA                 */
315

Frank Barchard's avatar
Frank Barchard committed
316 317 318 319
// Converts a row of planar Y/U/V with half-width chroma to 16-bit pixels
// packed by ARGBTOARGB4444, with the alpha input fixed at 255.
// d4 is loaded once with the 0x0f nibble mask the packing macro needs; d23
// is reloaded with 255 every pass because the packing macro overwrites it.
// Each loop iteration handles 8 pixels and writes 16 bytes to dst_argb4444.
// The loop body always runs at least once and repeats while width,
// decremented by 8 per pass, is still > 0.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
320
                            const struct YuvConstants* yuvconstants,
321
                            int width) {
322 323
  asm volatile(
      YUVTORGB_SETUP
324 325 326 327
      "vmov.u8    d4, #0x0f                      \n"  // vbic bits to clear
      "1:                                        \n"

      READYUV422 YUVTORGB
328
      "subs       %4, %4, #8                     \n"
329
      "vmov.u8    d23, #255                      \n" ARGBTOARGB4444
330 331 332 333 334 335 336 337 338 339 340 341 342
      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
      "bgt        1b                             \n"
      : "+r"(src_y),         // %0
        "+r"(src_u),         // %1
        "+r"(src_v),         // %2
        "+r"(dst_argb4444),  // %3
        "+r"(width)          // %4
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
343 344
}

Frank Barchard's avatar
Frank Barchard committed
345
// Converts a row of Y-only samples to 4-byte pixels stored in memory as
// B, G, R, 255. Chroma is fixed at 128 (READYUV400) and the constants always
// come from kYuvI601Constants; there is no yuvconstants parameter.
// Each loop iteration handles 8 pixels. The loop body always runs at least
// once and repeats while width, decremented by 8 per pass, is still > 0.
void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
346 347 348
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8    d23, #255                      \n"  // constant alpha
349
      "1:                                        \n" READYUV400 YUVTORGB
350 351 352 353 354 355 356 357 358 359 360 361
      "subs       %2, %2, #8                     \n"
      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // interleave B,G,R,A
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
        [kUVToG] "r"(&kYuvI601Constants.kUVToG),
        [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
        [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
362 363
}

Frank Barchard's avatar
Frank Barchard committed
364
// Expands a row of single-byte samples to 4-byte pixels: each source byte is
// copied unchanged into the first three bytes of the pixel and the fourth
// byte is set to 255. No scaling or colour conversion is applied.
// Each loop iteration handles 8 pixels. The loop body always runs at least
// once and repeats while width, decremented by 8 per pass, is still > 0.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
365 366
  asm volatile(
      "vmov.u8    d23, #255                      \n"  // constant 4th byte
367
      "1:                                        \n"
368 369 370 371 372 373 374 375 376 377 378
      "vld1.8     {d20}, [%0]!                   \n"  // load 8 samples
      "vmov       d21, d20                       \n"  // duplicate into 2nd byte
      "vmov       d22, d20                       \n"  // duplicate into 3rd byte
      "subs       %2, %2, #8                     \n"
      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d20", "d21", "d22", "d23");
379 380
}

Frank Barchard's avatar
Frank Barchard committed
381 382 383
// Converts a row with a Y plane and an interleaved chroma plane (loaded by
// READNV12) to 4-byte pixels stored in memory as B, G, R, 255.
// Each loop iteration consumes 8 Y bytes and 8 chroma bytes and writes 32
// bytes to dst_argb. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
384
                        const struct YuvConstants* yuvconstants,
385
                        int width) {
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401
  asm volatile(YUVTORGB_SETUP
               "vmov.u8    d23, #255                      \n"
               "1:                                        \n" READNV12 YUVTORGB
               "subs       %3, %3, #8                     \n"
               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
               "bgt        1b                             \n"
               : "+r"(src_y),     // %0
                 "+r"(src_uv),    // %1
                 "+r"(dst_argb),  // %2
                 "+r"(width)      // %3
               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
                 [kUVToG] "r"(&yuvconstants->kUVToG),
                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
                 "q10", "q11", "q12", "q13", "q14", "q15");
402 403
}

Frank Barchard's avatar
Frank Barchard committed
404 405 406
// Same as NV12ToARGBRow_NEON but the interleaved chroma plane is loaded with
// READNV21, which takes the two chroma components in the opposite byte order.
// Each loop iteration consumes 8 Y bytes and 8 chroma bytes and writes 32
// bytes to dst_argb. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
407
                        const struct YuvConstants* yuvconstants,
408
                        int width) {
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
  asm volatile(YUVTORGB_SETUP
               "vmov.u8    d23, #255                      \n"
               "1:                                        \n" READNV21 YUVTORGB
               "subs       %3, %3, #8                     \n"
               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
               "bgt        1b                             \n"
               : "+r"(src_y),     // %0
                 "+r"(src_vu),    // %1
                 "+r"(dst_argb),  // %2
                 "+r"(width)      // %3
               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
                 [kUVToG] "r"(&yuvconstants->kUVToG),
                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
                 "q10", "q11", "q12", "q13", "q14", "q15");
425 426
}

427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
// Converts a row with a Y plane and an interleaved chroma plane (loaded by
// READNV12) to 3-byte pixels stored in memory as B, G, R.
// Each loop iteration consumes 8 Y bytes and 8 chroma bytes and writes 24
// bytes to dst_rgb24. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(

      YUVTORGB_SETUP

      "1:                                        \n"

      READNV12 YUVTORGB
      "subs       %3, %3, #8                     \n"
      "vst3.8     {d20, d21, d22}, [%2]!         \n"  // interleave B,G,R
      "bgt        1b                             \n"
      : "+r"(src_y),      // %0
        "+r"(src_uv),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

// Same as NV12ToRGB24Row_NEON but the interleaved chroma plane is loaded with
// READNV21, which takes the two chroma components in the opposite byte order.
// Each loop iteration consumes 8 Y bytes and 8 chroma bytes and writes 24
// bytes to dst_rgb24. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(

      YUVTORGB_SETUP

      "1:                                        \n"

      READNV21 YUVTORGB
      "subs       %3, %3, #8                     \n"
      "vst3.8     {d20, d21, d22}, [%2]!         \n"  // interleave B,G,R
      "bgt        1b                             \n"
      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_rgb24),  // %2
        "+r"(width)       // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
}

Frank Barchard's avatar
Frank Barchard committed
481 482 483
// Converts a row with a Y plane and an interleaved chroma plane (loaded by
// READNV12) to 16-bit pixels packed by ARGBTORGB565.
// Each loop iteration consumes 8 Y bytes and 8 chroma bytes and writes 16
// bytes to dst_rgb565. The loop body always runs at least once and repeats
// while width, decremented by 8 per pass, is still > 0.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
484
                          const struct YuvConstants* yuvconstants,
485
                          int width) {
486 487
  asm volatile(
      YUVTORGB_SETUP
488 489
      "1:                                        \n" READNV12 YUVTORGB
      "subs       %3, %3, #8                     \n" ARGBTORGB565
490 491 492 493 494 495 496 497 498 499 500 501
      "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
      "bgt        1b                             \n"
      : "+r"(src_y),       // %0
        "+r"(src_uv),      // %1
        "+r"(dst_rgb565),  // %2
        "+r"(width)        // %3
      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
        [kUVToG] "r"(&yuvconstants->kUVToG),
        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
        [kYToRgb] "r"(&yuvconstants->kYToRgb)
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
        "q12", "q13", "q14", "q15");
502 503
}

Frank Barchard's avatar
Frank Barchard committed
504 505
// Converts a row of packed 2-bytes-per-pixel data (unpacked by READYUY2,
// which takes the even-indexed bytes as Y) to 4-byte pixels stored in memory
// as B, G, R, 255.
// Each loop iteration consumes 16 source bytes and writes 32 bytes to
// dst_argb. The loop body always runs at least once and repeats while width,
// decremented by 8 per pass, is still > 0.
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
506
                        const struct YuvConstants* yuvconstants,
507
                        int width) {
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
  asm volatile(YUVTORGB_SETUP
               "vmov.u8    d23, #255                      \n"
               "1:                                        \n" READYUY2 YUVTORGB
               "subs       %2, %2, #8                     \n"
               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
               "bgt        1b                             \n"
               : "+r"(src_yuy2),  // %0
                 "+r"(dst_argb),  // %1
                 "+r"(width)      // %2
               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
                 [kUVToG] "r"(&yuvconstants->kUVToG),
                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
                 "q10", "q11", "q12", "q13", "q14", "q15");
523 524
}

Frank Barchard's avatar
Frank Barchard committed
525 526
// Converts a row of packed 2-bytes-per-pixel data (unpacked by READUYVY,
// which takes the odd-indexed bytes as Y) to 4-byte pixels stored in memory
// as B, G, R, 255.
// Each loop iteration consumes 16 source bytes and writes 32 bytes to
// dst_argb. The loop body always runs at least once and repeats while width,
// decremented by 8 per pass, is still > 0.
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
527
                        const struct YuvConstants* yuvconstants,
528
                        int width) {
529 530 531 532 533 534 535 536 537 538 539 540 541 542 543
  asm volatile(YUVTORGB_SETUP
               "vmov.u8    d23, #255                      \n"
               "1:                                        \n" READUYVY YUVTORGB
               "subs       %2, %2, #8                     \n"
               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
               "bgt        1b                             \n"
               : "+r"(src_uyvy),  // %0
                 "+r"(dst_argb),  // %1
                 "+r"(width)      // %2
               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
                 [kUVToG] "r"(&yuvconstants->kUVToG),
                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
                 "q10", "q11", "q12", "q13", "q14", "q15");
544 545
}

546
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
Frank Barchard's avatar
Frank Barchard committed
547 548 549
// De-interleaves a row of byte pairs: even-indexed source bytes go to dst_u
// and odd-indexed source bytes go to dst_v.
// Each loop iteration reads 32 bytes from src_uv and writes 16 bytes to each
// destination. The loop body always runs at least once and repeats while
// width, decremented by 16 per pass, is still > 0.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
550
                     int width) {
551
  asm volatile(
552
      "1:                                        \n"
553 554 555 556 557 558 559 560 561 562 563 564
      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
      "subs       %3, %3, #16                    \n"  // 16 processed per loop
      "vst1.8     {q0}, [%1]!                    \n"  // store U
      "vst1.8     {q1}, [%2]!                    \n"  // store V
      "bgt        1b                             \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
565 566
}

567
// Reads 16 U's and V's and writes out 16 pairs of UV.
Frank Barchard's avatar
Frank Barchard committed
568 569 570
// Interleaves two byte planes: output byte 2*i comes from src_u[i] and output
// byte 2*i+1 comes from src_v[i].
// Each loop iteration reads 16 bytes from each source and writes 32 bytes to
// dst_uv. The loop body always runs at least once and repeats while width,
// decremented by 16 per pass, is still > 0.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
571
                     int width) {
572
  asm volatile(
573
      "1:                                        \n"
574 575 576
      "vld1.8     {q0}, [%0]!                    \n"  // load U
      "vld1.8     {q1}, [%1]!                    \n"  // load V
      "subs       %3, %3, #16                    \n"  // 16 processed per loop
577
      "vst2.8     {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
578 579 580 581 582 583 584 585
      "bgt        1b                             \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
586
}
587

588
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
Frank Barchard's avatar
Frank Barchard committed
589 590 591 592
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
                      int width) {
  asm volatile(
      "1:                                        \n"
      "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB
      "vld3.8     {d1, d3, d5}, [%0]!            \n"  // next 8 RGB
      "subs       %4, %4, #16                    \n"  // 16 processed per loop
      "vst1.8     {q0}, [%1]!                    \n"  // store R
      "vst1.8     {q1}, [%2]!                    \n"  // store G
      "vst1.8     {q2}, [%3]!                    \n"  // store B
      "bgt        1b                             \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "d0", "d1", "d2"  // Clobber List
      );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
Frank Barchard's avatar
Frank Barchard committed
614 615 616 617
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
                      int width) {
  asm volatile(
      "1:                                        \n"
      "vld1.8     {q0}, [%0]!                    \n"  // load R
      "vld1.8     {q1}, [%1]!                    \n"  // load G
      "vld1.8     {q2}, [%2]!                    \n"  // load B
      "subs       %4, %4, #16                    \n"  // 16 processed per loop
      "vst3.8     {d0, d2, d4}, [%3]!            \n"  // store 8 RGB
      "vst3.8     {d1, d3, d5}, [%3]!            \n"  // next 8 RGB
      "bgt        1b                             \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
      );
}

638
// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
639
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
640
  asm volatile(
641
      "1:                                        \n"
642 643 644 645 646 647
      "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
      "subs       %2, %2, #32                    \n"  // 32 processed per loop
      "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
      "bgt        1b                             \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
648
        "+r"(width)                 // %2  // Output registers
649 650 651
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
652 653
}

654 655
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
656 657
  asm volatile(
      "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
658
      "1:                                        \n"
659 660 661 662
      "subs      %1, %1, #16                     \n"  // 16 bytes per loop
      "vst1.8    {q0}, [%0]!                     \n"  // store
      "bgt       1b                              \n"
      : "+r"(dst),   // %0
663
        "+r"(width)  // %1
664 665
      : "r"(v8)      // %2
      : "cc", "memory", "q0");
666 667
}

668 669
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
670 671
  asm volatile(
      "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
672
      "1:                                        \n"
673 674 675 676
      "subs      %1, %1, #4                      \n"  // 4 pixels per loop
      "vst1.8    {q0}, [%0]!                     \n"  // store
      "bgt       1b                              \n"
      : "+r"(dst),   // %0
677
        "+r"(width)  // %1
678 679
      : "r"(v32)     // %2
      : "cc", "memory", "q0");
680 681
}

Frank Barchard's avatar
Frank Barchard committed
682
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
683 684 685 686 687 688
  asm volatile(
      // Start at end of source row.
      "mov        r3, #-16                       \n"
      "add        %0, %0, %2                     \n"
      "sub        %0, #16                        \n"

689
      "1:                                        \n"
690 691 692 693 694 695 696 697 698 699 700
      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
      "subs       %2, #16                        \n"  // 16 pixels per loop.
      "vrev64.8   q0, q0                         \n"
      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
      "vst1.8     {d0}, [%1]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
701 702
}

Frank Barchard's avatar
Frank Barchard committed
703 704 705
void MirrorUVRow_NEON(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
706
                      int width) {
707 708 709 710 711 712
  asm volatile(
      // Start at end of source row.
      "mov        r12, #-16                      \n"
      "add        %0, %0, %3, lsl #1             \n"
      "sub        %0, #16                        \n"

713
      "1:                                        \n"
714 715 716 717 718 719 720 721 722 723 724 725
      "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
      "subs       %3, #8                         \n"  // 8 pixels per loop.
      "vrev64.8   q0, q0                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
      "vst1.8     {d1}, [%2]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc", "memory", "r12", "q0");
726
}
727

Frank Barchard's avatar
Frank Barchard committed
728
void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
729 730 731 732 733 734
  asm volatile(
      // Start at end of source row.
      "mov        r3, #-16                       \n"
      "add        %0, %0, %2, lsl #2             \n"
      "sub        %0, #16                        \n"

735
      "1:                                        \n"
736 737 738 739 740 741 742 743 744 745 746
      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
      "subs       %2, #4                         \n"  // 4 pixels per loop.
      "vrev64.32  q0, q0                         \n"
      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
      "vst1.8     {d0}, [%1]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "r3", "q0");
747
}
748

749 750 751
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
752 753
  asm volatile(
      "vmov.u8    d4, #255                       \n"  // Alpha
754
      "1:                                        \n"
755 756 757 758 759 760 761 762 763 764
      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
      );
765 766
}

Frank Barchard's avatar
Frank Barchard committed
767
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
768 769
  asm volatile(
      "vmov.u8    d4, #255                       \n"  // Alpha
770
      "1:                                        \n"
771 772 773 774 775 776 777 778 779 780 781
      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vswp.u8    d1, d3                         \n"  // swap R, B
      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
      );
782 783
}

Frank Barchard's avatar
Frank Barchard committed
784
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
785
  asm volatile(
786
      "1:                                        \n"
787 788 789 790 791 792 793 794 795 796 797 798
      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vswp.u8    d1, d3                         \n"  // swap R, B
      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
                                                      // RGB24.
      "bgt        1b                             \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
      );
799 800
}

Frank Barchard's avatar
Frank Barchard committed
801 802 803 804 805 806 807 808 809 810 811
// Asm fragment: expands 8 RGB565 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels.
//   In:      q0 (d0, d1) = 8 packed pixels.
//   Out:     d0 = B, d1 = G, d2 = R.
//   Scratch: d4, d5, d6.
// d3 is not written, so the caller may preload it (e.g. with alpha).
// Each channel is widened by copying its top bits into the vacated low bits.
// The fragment references no asm operands (%N), only fixed NEON registers.
#define RGB565TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                    */
812

813 814 815
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
816 817
  asm volatile(
      "vmov.u8    d3, #255                       \n"  // Alpha
818
      "1:                                        \n"
819 820 821 822 823 824 825 826 827 828 829
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      RGB565TOARGB
      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
      );
830 831
}

Frank Barchard's avatar
Frank Barchard committed
832 833 834 835 836 837 838 839 840 841 842 843 844
// Asm fragment: expands 8 ARGB1555 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels.
//   In:      q0 (d0, d1) = 8 packed pixels.
//   Out:     d0 = B, d1 = G, d2 = R, d3 = A.
//   Scratch: q2 (d4, d5), q3 (d6, d7).
// The single alpha bit is turned into 0x00 or 0xFF by negating it; the 5-bit
// colour channels are widened by copying their top bits into the low bits.
// Note that d3 is overwritten, so any value preloaded there is lost.
// The fragment references no asm operands (%N), only fixed NEON registers.
#define ARGB1555TOARGB                                                      \
  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
845

846
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
Frank Barchard's avatar
Frank Barchard committed
847 848 849 850 851 852 853 854 855 856 857 858
// Asm fragment: expands the three 5-bit colour channels of 8 16-bit pixels
// held in q0 into 8-bit channels; the top bit of each pixel is discarded.
//   In:      q0 (d0, d1) = 8 packed pixels.
//   Out:     d0 = B, d1 = G, d2 = R.
//   Scratch: d4, d5, d6.
// d3 is not written, so the caller may preload it (e.g. with alpha).
// The fragment references no asm operands (%N), only fixed NEON registers.
#define RGB555TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                    */

Frank Barchard's avatar
Frank Barchard committed
859 860
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
861
                            int width) {
862 863
  asm volatile(
      "vmov.u8    d3, #255                       \n"  // Alpha
864
      "1:                                        \n"
865 866 867 868 869 870 871 872 873 874 875
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
      );
876 877
}

Frank Barchard's avatar
Frank Barchard committed
878 879 880 881 882 883 884 885 886
// Asm fragment: expands 8 ARGB4444 pixels held in q0 (one 16-bit value per
// pixel) into 8-bit channels by duplicating each 4-bit nibble into both
// halves of its byte.
//   In:      q0 (d0, d1) = 8 packed pixels.
//   Out:     d0 = B, d1 = G, d2 = R, d3 = A.
//   Scratch: q2 (d4, d5).
// Note that d3 is overwritten, so any value preloaded there is lost.
// The fragment references no asm operands (%N), only fixed NEON registers.
#define ARGB4444TOARGB                                                      \
  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
887

Frank Barchard's avatar
Frank Barchard committed
888 889
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
890
                            int width) {
891 892
  asm volatile(
      "vmov.u8    d3, #255                       \n"  // Alpha
893
      "1:                                        \n"
894 895 896 897 898 899 900 901 902 903 904
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
      );
905 906
}

907 908 909
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
910
  asm volatile(
911
      "1:                                        \n"
912 913 914 915 916 917 918 919 920 921 922
      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
                                                      // RGB24.
      "bgt        1b                             \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
      );
923 924
}

Frank Barchard's avatar
Frank Barchard committed
925
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
926
  asm volatile(
927
      "1:                                        \n"
928 929 930 931 932 933 934 935 936 937 938
      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vswp.u8    d1, d3                         \n"  // swap R, B
      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
      );
939 940
}

Frank Barchard's avatar
Frank Barchard committed
941
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
942
  asm volatile(
943
      "1:                                        \n"
944 945 946 947 948 949 950 951 952 953
      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
      "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
      "bgt        1b                             \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
954 955
}

Frank Barchard's avatar
Frank Barchard committed
956
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
957
  asm volatile(
958
      "1:                                        \n"
959 960 961 962 963 964 965 966 967 968
      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
      "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
      "bgt        1b                             \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
969 970
}

Frank Barchard's avatar
Frank Barchard committed
971 972 973
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
974
                         int width) {
975
  asm volatile(
976
      "1:                                        \n"
977 978 979 980 981 982 983 984 985 986 987 988
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
      "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
      "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
      "bgt        1b                             \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
      );
989 990
}

Frank Barchard's avatar
Frank Barchard committed
991 992 993
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
994
                         int width) {
995
  asm volatile(
996
      "1:                                        \n"
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
      "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
      "bgt        1b                             \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
      );
1009 1010
}

Frank Barchard's avatar
Frank Barchard committed
1011
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
Frank Barchard's avatar
Frank Barchard committed
1012
                      int stride_yuy2,
Frank Barchard's avatar
Frank Barchard committed
1013 1014
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1015
                      int width) {
1016 1017
  asm volatile(
      "add        %1, %0, %1                     \n"  // stride + src_yuy2
1018
      "1:                                        \n"
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
      "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
      "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
      "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
      "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
      "bgt        1b                             \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
      );
1036 1037
}

Frank Barchard's avatar
Frank Barchard committed
1038
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
Frank Barchard's avatar
Frank Barchard committed
1039
                      int stride_uyvy,
Frank Barchard's avatar
Frank Barchard committed
1040 1041
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1042
                      int width) {
1043 1044
  asm volatile(
      "add        %1, %0, %1                     \n"  // stride + src_uyvy
1045
      "1:                                        \n"
1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
      "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
      "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
      "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
      "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
      "bgt        1b                             \n"
      : "+r"(src_uyvy),     // %0
        "+r"(stride_uyvy),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
      );
1063
}
1064

fbarchard@google.com's avatar
fbarchard@google.com committed
1065
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
Frank Barchard's avatar
Frank Barchard committed
1066 1067 1068
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
Frank Barchard's avatar
Frank Barchard committed
1069
                         int width) {
1070 1071
  asm volatile(
      "vld1.8     {q2}, [%3]                     \n"  // shuffler
1072
      "1:                                        \n"
1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
      "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
      "subs       %2, %2, #4                     \n"  // 4 processed per loop
      "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
      "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
      "vst1.8     {q1}, [%1]!                    \n"  // store 4.
      "bgt        1b                             \n"
      : "+r"(src_argb),                   // %0
        "+r"(dst_argb),                   // %1
        "+r"(width)                       // %2
      : "r"(shuffler)                     // %3
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
      );
1085 1086
}

Frank Barchard's avatar
Frank Barchard committed
1087 1088 1089 1090
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
Frank Barchard's avatar
Frank Barchard committed
1091
                        int width) {
1092
  asm volatile(
1093
      "1:                                        \n"
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
      "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
      "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
      "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
      "subs       %4, %4, #16                    \n"  // 16 pixels
      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
fbarchard@google.com's avatar
fbarchard@google.com committed
1107 1108
}

Frank Barchard's avatar
Frank Barchard committed
1109 1110 1111 1112
void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
Frank Barchard's avatar
Frank Barchard committed
1113
                        int width) {
1114
  asm volatile(
1115
      "1:                                        \n"
1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
      "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
      "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
      "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
      "subs       %4, %4, #16                    \n"  // 16 pixels
      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
      "bgt        1b                             \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
fbarchard@google.com's avatar
fbarchard@google.com committed
1129 1130
}

1131 1132 1133
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
1134
  asm volatile(
1135
      "1:                                        \n"
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGBTORGB565
      "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
      "bgt        1b                             \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
1146 1147
}

Frank Barchard's avatar
Frank Barchard committed
1148 1149 1150
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                const uint32_t dither4,
Frank Barchard's avatar
Frank Barchard committed
1151
                                int width) {
1152 1153
  asm volatile(
      "vdup.32    d2, %2                         \n"  // dither4
1154
      "1:                                        \n"
1155 1156 1157 1158
      "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vqadd.u8   d20, d20, d2                   \n"
      "vqadd.u8   d21, d21, d2                   \n"
1159 1160 1161
      "vqadd.u8   d22, d22, d2                   \n"  // add for dither
      ARGBTORGB565
      "vst1.8     {q0}, [%0]!                    \n"  // store 8 RGB565.
1162 1163 1164 1165 1166 1167
      "bgt        1b                             \n"
      : "+r"(dst_rgb)   // %0
      : "r"(src_argb),  // %1
        "r"(dither4),   // %2
        "r"(width)      // %3
      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
1168 1169
}

Frank Barchard's avatar
Frank Barchard committed
1170 1171
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
1172
                            int width) {
1173
  asm volatile(
1174
      "1:                                        \n"
1175 1176 1177
      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGBTOARGB1555
1178
      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB1555.
1179 1180 1181 1182 1183 1184
      "bgt        1b                             \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
1185 1186
}

Frank Barchard's avatar
Frank Barchard committed
1187 1188
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
1189
                            int width) {
1190 1191 1192
  asm volatile(
      "vmov.u8    d4, #0x0f                      \n"  // bits to clear with
                                                      // vbic.
1193
      "1:                                        \n"
1194 1195 1196
      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGBTOARGB4444
1197
      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB4444.
1198 1199 1200 1201 1202 1203
      "bgt        1b                             \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
1204
}
1205

Frank Barchard's avatar
Frank Barchard committed
1206
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1207 1208 1209 1210 1211
  asm volatile(
      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
      "vmov.u8    d27, #16                       \n"  // Add 16 constant
1212
      "1:                                        \n"
1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d27                        \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
1227 1228
}

1229 1230 1231
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
1232
  asm volatile(
1233
      "1:                                        \n"
1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244
      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
      "subs       %2, %2, #16                    \n"  // 16 processed per loop
      "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
      "bgt       1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
      );
1245 1246
}

Frank Barchard's avatar
Frank Barchard committed
1247
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1248 1249 1250 1251
  asm volatile(
      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
1252
      "1:                                        \n"
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
1266 1267
}

1268
// 8x1 pixels.
Frank Barchard's avatar
Frank Barchard committed
1269 1270 1271
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
1272
                         int width) {
1273 1274 1275 1276 1277 1278 1279 1280
  asm volatile(
      "vmov.u8    d24, #112                      \n"  // UB / VR 0.875
                                                      // coefficient
      "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
      "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
      "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
      "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
      "vmov.u16   q15, #0x8080                   \n"  // 128.5
1281
      "1:                                        \n"
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlsl.u8   q2, d1, d25                    \n"  // G
      "vmlsl.u8   q2, d2, d26                    \n"  // R
      "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned

      "vmull.u8   q3, d2, d24                    \n"  // R
      "vmlsl.u8   q3, d1, d28                    \n"  // G
      "vmlsl.u8   q3, d0, d27                    \n"  // B
      "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned

      "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
      "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V

      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
      "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
        "q15");
1307
}
1308

1309
// clang-format off
1310
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1311
// Assembly fragment that turns 8 lanes of 16-bit B, G, R (passed as the q
// register names QB, QG, QR) into 8 U bytes in d0 and 8 V bytes in d1.
// Register contract, as used by the instructions below:
//   inputs : q10 = UB / VR coefficient, q11 = UG, q12 = UR, q13 = VB,
//            q14 = VG, q15 = bias added before the final shift.
//   scratch: q8 (U accumulator), q9 (V accumulator).
//   outputs: d0 = U, d1 = V (each accumulator shifted right by 8 and
//            narrowed with unsigned saturation).
#define RGBTOUV(QB, QG, QR)                                                 \
  "vmul.s16   q8, " #QB ", q10               \n" /* B                    */ \
  "vmls.s16   q8, " #QG ", q11               \n" /* G                    */ \
  "vmls.s16   q8, " #QR ", q12               \n" /* R                    */ \
  "vadd.u16   q8, q8, q15                    \n" /* +128 -> unsigned     */ \
  "vmul.s16   q9, " #QR ", q10               \n" /* R                    */ \
  "vmls.s16   q9, " #QG ", q14               \n" /* G                    */ \
  "vmls.s16   q9, " #QB ", q13               \n" /* B                    */ \
  "vadd.u16   q9, q9, q15                    \n" /* +128 -> unsigned     */ \
  "vqshrn.u16  d0, q8, #8                    \n" /* 16 bit to 8 bit U    */ \
  "vqshrn.u16  d1, q9, #8                    \n" /* 16 bit to 8 bit V    */
1322
// clang-format on
1323

1324
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
Frank Barchard's avatar
Frank Barchard committed
1325
void ARGBToUVRow_NEON(const uint8_t* src_argb,
Frank Barchard's avatar
Frank Barchard committed
1326
                      int src_stride_argb,
Frank Barchard's avatar
Frank Barchard committed
1327 1328
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1329
                      int width) {
fbarchard@google.com's avatar
fbarchard@google.com committed
1330
  asm volatile (
fbarchard@google.com's avatar
fbarchard@google.com committed
1331
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1332 1333 1334 1335 1336
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
fbarchard@google.com's avatar
fbarchard@google.com committed
1337
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1338
    "1:                                        \n"
fbarchard@google.com's avatar
fbarchard@google.com committed
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1349 1350 1351 1352 1353

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

fbarchard@google.com's avatar
fbarchard@google.com committed
1354
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
1355
    RGBTOUV(q0, q1, q2)
fbarchard@google.com's avatar
fbarchard@google.com committed
1356 1357
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
fbarchard@google.com's avatar
fbarchard@google.com committed
1358 1359
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
fbarchard@google.com's avatar
fbarchard@google.com committed
1360 1361 1362
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1363
    "+r"(width)        // %4
fbarchard@google.com's avatar
fbarchard@google.com committed
1364
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1365
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
fbarchard@google.com's avatar
fbarchard@google.com committed
1366 1367 1368
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1369

1370
// TODO(fbarchard): Subsample match C code.
Frank Barchard's avatar
Frank Barchard committed
1371
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
Frank Barchard's avatar
Frank Barchard committed
1372
                       int src_stride_argb,
Frank Barchard's avatar
Frank Barchard committed
1373 1374
                       uint8_t* dst_u,
                       uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1375
                       int width) {
1376 1377
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_argb
1378 1379 1380 1381 1382
    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
1383
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1384
    "1:                                        \n"
1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1395 1396 1397 1398 1399

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1400 1401 1402 1403 1404 1405 1406 1407 1408
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(src_stride_argb),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1409
    "+r"(width)        // %4
1410 1411 1412 1413 1414 1415
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1416
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
Frank Barchard's avatar
Frank Barchard committed
1417
                      int src_stride_bgra,
Frank Barchard's avatar
Frank Barchard committed
1418 1419
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1420
                      int width) {
1421 1422
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
1423 1424 1425 1426 1427
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1428
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1429
    "1:                                        \n"
1430 1431 1432 1433 1434 1435 1436 1437 1438 1439
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
1440 1441 1442 1443 1444

    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
    "vrshr.u16  q2, q2, #1                     \n"
    "vrshr.u16  q3, q3, #1                     \n"

1445 1446 1447 1448 1449 1450 1451 1452 1453
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q3, q2, q1)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_stride_bgra),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1454
    "+r"(width)        // %4
1455
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1456
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1457 1458 1459 1460
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1461
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
Frank Barchard's avatar
Frank Barchard committed
1462
                      int src_stride_abgr,
Frank Barchard's avatar
Frank Barchard committed
1463 1464
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1465
                      int width) {
1466 1467
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
1468 1469 1470 1471 1472
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1473
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1474
    "1:                                        \n"
1475 1476 1477 1478 1479 1480 1481 1482 1483 1484
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1485 1486 1487 1488 1489

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1490 1491 1492 1493 1494 1495 1496 1497 1498
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_stride_abgr),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1499
    "+r"(width)        // %4
1500
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1501
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1502 1503 1504 1505
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1506
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
Frank Barchard's avatar
Frank Barchard committed
1507
                      int src_stride_rgba,
Frank Barchard's avatar
Frank Barchard committed
1508 1509
                      uint8_t* dst_u,
                      uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1510
                      int width) {
1511 1512
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
1513 1514 1515 1516 1517
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1518
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1519
    "1:                                        \n"
1520 1521 1522 1523 1524 1525 1526 1527 1528 1529
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
1530 1531 1532 1533 1534

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1535 1536 1537 1538 1539 1540 1541 1542 1543
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_stride_rgba),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1544
    "+r"(width)        // %4
1545
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1546
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1547 1548 1549 1550
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1551
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
Frank Barchard's avatar
Frank Barchard committed
1552
                       int src_stride_rgb24,
Frank Barchard's avatar
Frank Barchard committed
1553 1554
                       uint8_t* dst_u,
                       uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1555
                       int width) {
1556 1557
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
1558 1559 1560 1561 1562
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1563
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1564
    "1:                                        \n"
1565 1566 1567 1568 1569 1570 1571 1572 1573 1574
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
1575 1576 1577 1578 1579

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1580 1581 1582 1583 1584 1585 1586 1587 1588
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q0, q1, q2)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(src_stride_rgb24),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1589
    "+r"(width)        // %4
1590
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1591
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1592 1593 1594 1595
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

Frank Barchard's avatar
Frank Barchard committed
1596
void RAWToUVRow_NEON(const uint8_t* src_raw,
Frank Barchard's avatar
Frank Barchard committed
1597
                     int src_stride_raw,
Frank Barchard's avatar
Frank Barchard committed
1598 1599
                     uint8_t* dst_u,
                     uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1600
                     int width) {
1601 1602
  asm volatile (
    "add        %1, %0, %1                     \n"  // src_stride + src_raw
1603 1604 1605 1606 1607
    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
1608
    "vmov.u16   q15, #0x8080                   \n"  // 128.5
1609
    "1:                                        \n"
1610 1611 1612 1613 1614 1615 1616 1617 1618 1619
    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
1620 1621 1622 1623 1624

    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
    "vrshr.u16  q1, q1, #1                     \n"
    "vrshr.u16  q2, q2, #1                     \n"

1625 1626 1627 1628 1629 1630 1631 1632 1633
    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
    RGBTOUV(q2, q1, q0)
    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
    "bgt        1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(src_stride_raw),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
1634
    "+r"(width)        // %4
1635
  :
fbarchard@google.com's avatar
fbarchard@google.com committed
1636
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1637 1638 1639 1640
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

1641
// 16x2 pixels -> 8x1.  width is number of RGB565 pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1642
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
Frank Barchard's avatar
Frank Barchard committed
1643
                        int src_stride_rgb565,
Frank Barchard's avatar
Frank Barchard committed
1644 1645
                        uint8_t* dst_u,
                        uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1646
                        int width) {
1647 1648 1649 1650 1651 1652 1653 1654 1655
  asm volatile(
      "add        %1, %0, %1                     \n"  // src_stride + src_argb
      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
                                                      // coefficient
      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
      "vmov.u16   q15, #0x8080                   \n"  // 128.5
1656
      "1:                                        \n"
1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
      "vrshr.u16  q5, q5, #1                     \n"
      "vrshr.u16  q6, q6, #1                     \n"

      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
      "vmul.s16   q8, q4, q10                    \n"  // B
      "vmls.s16   q8, q5, q11                    \n"  // G
      "vmls.s16   q8, q6, q12                    \n"  // R
      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
      "vmul.s16   q9, q6, q10                    \n"  // R
      "vmls.s16   q9, q5, q14                    \n"  // G
      "vmls.s16   q9, q4, q13                    \n"  // B
      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
      "bgt        1b                             \n"
      : "+r"(src_rgb565),         // %0
        "+r"(src_stride_rgb565),  // %1
        "+r"(dst_u),              // %2
        "+r"(dst_v),              // %3
        "+r"(width)               // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
fbarchard@google.com's avatar
fbarchard@google.com committed
1705
}
1706

1707
// 16x2 pixels -> 8x1.  width is number of ARGB1555 pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1708
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
Frank Barchard's avatar
Frank Barchard committed
1709
                          int src_stride_argb1555,
Frank Barchard's avatar
Frank Barchard committed
1710 1711
                          uint8_t* dst_u,
                          uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1712
                          int width) {
1713 1714 1715 1716 1717 1718 1719 1720 1721
  asm volatile(
      "add        %1, %0, %1                     \n"  // src_stride + src_argb
      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
                                                      // coefficient
      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
      "vmov.u16   q15, #0x8080                   \n"  // 128.5
1722
      "1:                                        \n"
1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
      "vrshr.u16  q5, q5, #1                     \n"
      "vrshr.u16  q6, q6, #1                     \n"

      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
      "vmul.s16   q8, q4, q10                    \n"  // B
      "vmls.s16   q8, q5, q11                    \n"  // G
      "vmls.s16   q8, q6, q12                    \n"  // R
      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
      "vmul.s16   q9, q6, q10                    \n"  // R
      "vmls.s16   q9, q5, q14                    \n"  // G
      "vmls.s16   q9, q4, q13                    \n"  // B
      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
      "bgt        1b                             \n"
      : "+r"(src_argb1555),         // %0
        "+r"(src_stride_argb1555),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1771 1772
}

1773
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
Frank Barchard's avatar
Frank Barchard committed
1774
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
Frank Barchard's avatar
Frank Barchard committed
1775
                          int src_stride_argb4444,
Frank Barchard's avatar
Frank Barchard committed
1776 1777
                          uint8_t* dst_u,
                          uint8_t* dst_v,
Frank Barchard's avatar
Frank Barchard committed
1778
                          int width) {
1779 1780 1781 1782 1783 1784 1785 1786 1787
  asm volatile(
      "add        %1, %0, %1                     \n"  // src_stride + src_argb
      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
                                                      // coefficient
      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
      "vmov.u16   q15, #0x8080                   \n"  // 128.5
1788
      "1:                                        \n"
1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
      "vrshr.u16  q5, q5, #1                     \n"
      "vrshr.u16  q6, q6, #1                     \n"

      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
      "vmul.s16   q8, q4, q10                    \n"  // B
      "vmls.s16   q8, q5, q11                    \n"  // G
      "vmls.s16   q8, q6, q12                    \n"  // R
      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
      "vmul.s16   q9, q6, q10                    \n"  // R
      "vmls.s16   q9, q5, q14                    \n"  // G
      "vmls.s16   q9, q4, q13                    \n"  // B
      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
      "bgt        1b                             \n"
      : "+r"(src_argb4444),         // %0
        "+r"(src_stride_argb4444),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1837
}
fbarchard@google.com's avatar
fbarchard@google.com committed
1838

Frank Barchard's avatar
Frank Barchard committed
1839
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1840 1841 1842 1843 1844
  asm volatile(
      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
      "vmov.u8    d27, #16                       \n"  // Add 16 constant
1845
      "1:                                        \n"
1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      RGB565TOARGB
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d27                        \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
1861 1862
}

1863 1864 1865
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
1866 1867 1868 1869 1870
  asm volatile(
      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
      "vmov.u8    d27, #16                       \n"  // Add 16 constant
1871
      "1:                                        \n"
1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d27                        \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
1887 1888
}

1889 1890 1891
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
1892 1893 1894 1895 1896
  asm volatile(
      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
      "vmov.u8    d27, #16                       \n"  // Add 16 constant
1897
      "1:                                        \n"
1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912
      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d27                        \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
1913
}
1914

Frank Barchard's avatar
Frank Barchard committed
1915
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1916 1917 1918 1919 1920
  asm volatile(
      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
      "vmov.u8    d7, #16                        \n"  // Add 16 constant
1921
      "1:                                        \n"
1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q8, d1, d4                     \n"  // R
      "vmlal.u8   q8, d2, d5                     \n"  // G
      "vmlal.u8   q8, d3, d6                     \n"  // B
      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d7                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
1936 1937
}

Frank Barchard's avatar
Frank Barchard committed
1938
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1939 1940 1941 1942 1943
  asm volatile(
      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
      "vmov.u8    d7, #16                        \n"  // Add 16 constant
1944
      "1:                                        \n"
1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q8, d0, d4                     \n"  // R
      "vmlal.u8   q8, d1, d5                     \n"  // G
      "vmlal.u8   q8, d2, d6                     \n"  // B
      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d7                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
1959 1960
}

Frank Barchard's avatar
Frank Barchard committed
1961
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1962 1963 1964 1965 1966
  asm volatile(
      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
      "vmov.u8    d7, #16                        \n"  // Add 16 constant
1967
      "1:                                        \n"
1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q8, d1, d4                     \n"  // B
      "vmlal.u8   q8, d2, d5                     \n"  // G
      "vmlal.u8   q8, d3, d6                     \n"  // R
      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d7                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
1982 1983
}

Frank Barchard's avatar
Frank Barchard committed
1984
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
1985 1986 1987 1988 1989
  asm volatile(
      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
      "vmov.u8    d7, #16                        \n"  // Add 16 constant
1990
      "1:                                        \n"
1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q8, d0, d4                     \n"  // B
      "vmlal.u8   q8, d1, d5                     \n"  // G
      "vmlal.u8   q8, d2, d6                     \n"  // R
      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d7                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_y),      // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
2005
}
2006

Frank Barchard's avatar
Frank Barchard committed
2007
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
2008 2009 2010 2011 2012
  asm volatile(
      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
      "vmov.u8    d7, #16                        \n"  // Add 16 constant
2013
      "1:                                        \n"
2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027
      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q8, d0, d4                     \n"  // B
      "vmlal.u8   q8, d1, d5                     \n"  // G
      "vmlal.u8   q8, d2, d6                     \n"  // R
      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
      "vqadd.u8   d0, d7                         \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
      "bgt        1b                             \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_y),    // %1
        "+r"(width)     // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
2028
}
2029

2030
// Bilinear filter 16x2 -> 16x1
Frank Barchard's avatar
Frank Barchard committed
2031 2032
void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
Frank Barchard's avatar
Frank Barchard committed
2033 2034 2035
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
2036
  int y1_fraction = source_y_fraction;
2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047
  asm volatile(
      "cmp        %4, #0                         \n"
      "beq        100f                           \n"
      "add        %2, %1                         \n"
      "cmp        %4, #128                       \n"
      "beq        50f                            \n"

      "vdup.8     d5, %4                         \n"
      "rsb        %4, #256                       \n"
      "vdup.8     d4, %4                         \n"
      // General purpose row blend.
2048
      "1:                                        \n"
2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062
      "vld1.8     {q0}, [%1]!                    \n"
      "vld1.8     {q1}, [%2]!                    \n"
      "subs       %3, %3, #16                    \n"
      "vmull.u8   q13, d0, d4                    \n"
      "vmull.u8   q14, d1, d4                    \n"
      "vmlal.u8   q13, d2, d5                    \n"
      "vmlal.u8   q14, d3, d5                    \n"
      "vrshrn.u16 d0, q13, #8                    \n"
      "vrshrn.u16 d1, q14, #8                    \n"
      "vst1.8     {q0}, [%0]!                    \n"
      "bgt        1b                             \n"
      "b          99f                            \n"

      // Blend 50 / 50.
2063
      "50:                                       \n"
2064 2065 2066 2067 2068 2069 2070 2071 2072
      "vld1.8     {q0}, [%1]!                    \n"
      "vld1.8     {q1}, [%2]!                    \n"
      "subs       %3, %3, #16                    \n"
      "vrhadd.u8  q0, q1                         \n"
      "vst1.8     {q0}, [%0]!                    \n"
      "bgt        50b                            \n"
      "b          99f                            \n"

      // Blend 100 / 0 - Copy row unchanged.
2073
      "100:                                      \n"
2074 2075 2076 2077 2078
      "vld1.8     {q0}, [%1]!                    \n"
      "subs       %3, %3, #16                    \n"
      "vst1.8     {q0}, [%0]!                    \n"
      "bgt        100b                           \n"

2079
      "99:                                       \n"
2080 2081 2082 2083 2084 2085 2086
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(src_stride),  // %2
        "+r"(dst_width),   // %3
        "+r"(y1_fraction)  // %4
      :
      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
2087
}
fbarchard@google.com's avatar
fbarchard@google.com committed
2088 2089

// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
Frank Barchard's avatar
Frank Barchard committed
2090 2091 2092
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2093
                       int width) {
2094 2095 2096 2097
  asm volatile(
      "subs       %3, #8                         \n"
      "blt        89f                            \n"
      // Blend 8 pixels.
2098
      "8:                                        \n"
2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q10, d4, d3                    \n"  // db * a
      "vmull.u8   q11, d5, d3                    \n"  // dg * a
      "vmull.u8   q12, d6, d3                    \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
      "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
      "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
      "vqadd.u8   q0, q0, q2                     \n"  // + sbg
      "vqadd.u8   d2, d2, d6                     \n"  // + sr
      "vmov.u8    d3, #255                       \n"  // a = 255
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
      "bge        8b                             \n"

2116
      "89:                                       \n"
2117 2118 2119 2120
      "adds       %3, #8-1                       \n"
      "blt        99f                            \n"

      // Blend 1 pixels.
2121
      "1:                                        \n"
2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146
      "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
      "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
      "subs       %3, %3, #1                     \n"    // 1 processed per loop.
      "vmull.u8   q10, d4, d3                    \n"    // db * a
      "vmull.u8   q11, d5, d3                    \n"    // dg * a
      "vmull.u8   q12, d6, d3                    \n"    // dr * a
      "vqrshrn.u16 d20, q10, #8                  \n"    // db >>= 8
      "vqrshrn.u16 d21, q11, #8                  \n"    // dg >>= 8
      "vqrshrn.u16 d22, q12, #8                  \n"    // dr >>= 8
      "vqsub.u8   q2, q2, q10                    \n"    // dbg - dbg * a / 256
      "vqsub.u8   d6, d6, d22                    \n"    // dr - dr * a / 256
      "vqadd.u8   q0, q0, q2                     \n"    // + sbg
      "vqadd.u8   d2, d2, d6                     \n"    // + sr
      "vmov.u8    d3, #255                       \n"    // a = 255
      "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
      "bge        1b                             \n"

      "99:                                         \n"

      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
fbarchard@google.com's avatar
fbarchard@google.com committed
2147 2148
}

2149
// Attenuate 8 pixels at a time.
2150 2151 2152
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
2153 2154
  asm volatile(
      // Attenuate 8 pixels.
2155
      "1:                                        \n"
2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q10, d0, d3                    \n"  // b * a
      "vmull.u8   q11, d1, d3                    \n"  // g * a
      "vmull.u8   q12, d2, d3                    \n"  // r * a
      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
2171 2172
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2173 2174
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
Frank Barchard's avatar
Frank Barchard committed
2175
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2176 2177 2178 2179
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
2180 2181 2182 2183 2184 2185 2186
  asm volatile(
      "vdup.u16   q8, %2                         \n"
      "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
      "vdup.u16   q9, %3                         \n"  // interval multiply.
      "vdup.u16   q10, %4                        \n"  // interval add

      // 8 pixel loop.
2187
      "1:                                        \n"
2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212
      "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
      "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
      "vmovl.u8   q1, d2                         \n"
      "vmovl.u8   q2, d4                         \n"
      "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
      "vqdmulh.s16 q1, q1, q8                    \n"  // g
      "vqdmulh.s16 q2, q2, q8                    \n"  // r
      "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
      "vmul.u16   q1, q1, q9                     \n"  // g
      "vmul.u16   q2, q2, q9                     \n"  // r
      "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
      "vadd.u16   q1, q1, q10                    \n"  // g
      "vadd.u16   q2, q2, q10                    \n"  // r
      "vqmovn.u16 d0, q0                         \n"
      "vqmovn.u16 d2, q1                         \n"
      "vqmovn.u16 d4, q2                         \n"
      "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
2213 2214 2215 2216
}

// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2217
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
Frank Barchard's avatar
Frank Barchard committed
2218 2219
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2220
                       int width,
Frank Barchard's avatar
Frank Barchard committed
2221
                       uint32_t value) {
2222 2223 2224 2225 2226 2227
  asm volatile(
      "vdup.u32   q0, %3                         \n"  // duplicate scale value.
      "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
      "vshr.u16   q0, q0, #1                     \n"  // scale / 2.

      // 8 pixel loop.
2228
      "1:                                        \n"
2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249
      "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
      "vmovl.u8   q11, d22                       \n"
      "vmovl.u8   q12, d24                       \n"
      "vmovl.u8   q13, d26                       \n"
      "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
      "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
      "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
      "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
      "vqmovn.u16 d20, q10                       \n"
      "vqmovn.u16 d22, q11                       \n"
      "vqmovn.u16 d24, q12                       \n"
      "vqmovn.u16 d26, q13                       \n"
      "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
fbarchard@google.com's avatar
fbarchard@google.com committed
2250 2251
}

2252
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2253 2254
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
Frank Barchard's avatar
Frank Barchard committed
2255
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
2256 2257 2258 2259
  asm volatile(
      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
2260
      "1:                                        \n"
2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q2, d0, d24                    \n"  // B
      "vmlal.u8   q2, d1, d25                    \n"  // G
      "vmlal.u8   q2, d2, d26                    \n"  // R
      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
      "vmov       d1, d0                         \n"  // G
      "vmov       d2, d0                         \n"  // R
      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
2276 2277
}

fbarchard@google.com's avatar
fbarchard@google.com committed
2278 2279 2280 2281
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
Frank Barchard's avatar
Frank Barchard committed
2282
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
2283 2284 2285 2286 2287 2288 2289 2290 2291 2292
  asm volatile(
      "vmov.u8    d20, #17                       \n"  // BB coefficient
      "vmov.u8    d21, #68                       \n"  // BG coefficient
      "vmov.u8    d22, #35                       \n"  // BR coefficient
      "vmov.u8    d24, #22                       \n"  // GB coefficient
      "vmov.u8    d25, #88                       \n"  // GG coefficient
      "vmov.u8    d26, #45                       \n"  // GR coefficient
      "vmov.u8    d28, #24                       \n"  // BB coefficient
      "vmov.u8    d29, #98                       \n"  // BG coefficient
      "vmov.u8    d30, #50                       \n"  // BR coefficient
2293
      "1:                                        \n"
2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314
      "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
      "vmlal.u8   q2, d1, d21                    \n"  // G
      "vmlal.u8   q2, d2, d22                    \n"  // R
      "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
      "vmlal.u8   q3, d1, d25                    \n"  // G
      "vmlal.u8   q3, d2, d26                    \n"  // R
      "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
      "vmlal.u8   q8, d1, d29                    \n"  // G
      "vmlal.u8   q8, d2, d30                    \n"  // R
      "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
      "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
      "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
      "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
        "q14", "q15");
fbarchard@google.com's avatar
fbarchard@google.com committed
2315 2316
}

2317
// Transform 8 ARGB pixels (32 bytes) with color matrix.
2318 2319
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
Frank Barchard's avatar
Frank Barchard committed
2320 2321 2322
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
Frank Barchard's avatar
Frank Barchard committed
2323
                             int width) {
2324 2325 2326 2327 2328
  asm volatile(
      "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
      "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
      "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.

2329
      "1:                                        \n"
2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375
      "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
      "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
      "vmovl.u8   q9, d18                        \n"  // g
      "vmovl.u8   q10, d20                       \n"  // r
      "vmovl.u8   q11, d22                       \n"  // a
      "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
      "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
      "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
      "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
      "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
      "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
      "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
      "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
      "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
      "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
      "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
      "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
      "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
      "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
      "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
      "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
      "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
      "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
      "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
      "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
      "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
2376 2377
}

2378
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2379 2380 2381
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2382
                          int width) {
2383 2384
  asm volatile(
      // 8 pixel loop.
2385
      "1:                                        \n"
2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404
      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
      "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vmull.u8   q0, d0, d1                     \n"  // multiply B
      "vmull.u8   q1, d2, d3                     \n"  // multiply G
      "vmull.u8   q2, d4, d5                     \n"  // multiply R
      "vmull.u8   q3, d6, d7                     \n"  // multiply A
      "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
      "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
      "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
      "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
2405 2406 2407
}

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2408 2409 2410
void ARGBAddRow_NEON(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2411
                     int width) {
2412 2413
  asm volatile(
      // 8 pixel loop.
2414
      "1:                                        \n"
2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vqadd.u8   q0, q0, q2                     \n"  // add B, G
      "vqadd.u8   q1, q1, q3                     \n"  // add R, A
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
2428 2429 2430
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
Frank Barchard's avatar
Frank Barchard committed
2431 2432 2433
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2434
                          int width) {
2435 2436
  asm volatile(
      // 8 pixel loop.
2437
      "1:                                        \n"
2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450
      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
      "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
2451 2452
}

2453 2454 2455 2456 2457
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
Frank Barchard's avatar
Frank Barchard committed
2458 2459 2460
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2461
                   int width) {
2462 2463 2464
  asm volatile(
      "vmov.u8    d3, #255                       \n"  // alpha
      // 8 pixel loop.
2465
      "1:                                        \n"
2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479
      "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
      "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vqadd.u8   d0, d0, d1                     \n"  // add
      "vmov.u8    d1, d0                         \n"
      "vmov.u8    d2, d0                         \n"
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
2480 2481 2482
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
Frank Barchard's avatar
Frank Barchard committed
2483 2484 2485
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
Frank Barchard's avatar
Frank Barchard committed
2486
                          int width) {
2487 2488
  asm volatile(
      // 16 pixel loop.
2489
      "1:                                        \n"
2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501
      "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
      "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
      "subs       %3, %3, #16                    \n"  // 16 processed per loop.
      "vqadd.u8   q0, q0, q1                     \n"  // add
      "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
      "bgt        1b                             \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
2502 2503 2504 2505 2506 2507 2508
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
Frank Barchard's avatar
Frank Barchard committed
2509 2510 2511
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
Frank Barchard's avatar
Frank Barchard committed
2512
                     int width) {
2513 2514 2515
  asm volatile(
      "vmov.u8    d3, #255                       \n"  // alpha
      // 8 pixel loop.
2516
      "1:                                        \n"
2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528
      "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
      "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
      "vqadd.u8   d1, d0, d2                     \n"  // add
      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
      "bgt        1b                             \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
2529 2530 2531 2532 2533 2534
}

// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
Frank Barchard's avatar
Frank Barchard committed
2535 2536 2537 2538
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
Frank Barchard's avatar
Frank Barchard committed
2539
                    int width) {
2540
  asm volatile(
2541
      "1:                                        \n"
2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567
      "vld1.8     {d0}, [%0],%5                  \n"  // top
      "vld1.8     {d1}, [%0],%6                  \n"
      "vsubl.u8   q0, d0, d1                     \n"
      "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
      "vld1.8     {d3}, [%1],%6                  \n"
      "vsubl.u8   q1, d2, d3                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vld1.8     {d2}, [%2],%5                  \n"  // bottom
      "vld1.8     {d3}, [%2],%6                  \n"
      "subs       %4, %4, #8                     \n"  // 8 pixels
      "vsubl.u8   q1, d2, d3                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vabs.s16   q0, q0                         \n"
      "vqmovn.u16 d0, q0                         \n"
      "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
      "bgt        1b                             \n"
      : "+r"(src_y0),               // %0
        "+r"(src_y1),               // %1
        "+r"(src_y2),               // %2
        "+r"(dst_sobelx),           // %3
        "+r"(width)                 // %4
      : "r"(2),                     // %5
        "r"(6)                      // %6
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
2568 2569 2570 2571 2572 2573
}

// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
Frank Barchard's avatar
Frank Barchard committed
2574 2575 2576
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
Frank Barchard's avatar
Frank Barchard committed
2577
                    int width) {
2578
  asm volatile(
2579
      "1:                                        \n"
2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604
      "vld1.8     {d0}, [%0],%4                  \n"  // left
      "vld1.8     {d1}, [%1],%4                  \n"
      "vsubl.u8   q0, d0, d1                     \n"
      "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
      "vld1.8     {d3}, [%1],%4                  \n"
      "vsubl.u8   q1, d2, d3                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vld1.8     {d2}, [%0],%5                  \n"  // right
      "vld1.8     {d3}, [%1],%5                  \n"
      "subs       %3, %3, #8                     \n"  // 8 pixels
      "vsubl.u8   q1, d2, d3                     \n"
      "vadd.s16   q0, q0, q1                     \n"
      "vabs.s16   q0, q0                         \n"
      "vqmovn.u16 d0, q0                         \n"
      "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
      "bgt        1b                             \n"
      : "+r"(src_y0),               // %0
        "+r"(src_y1),               // %1
        "+r"(dst_sobely),           // %2
        "+r"(width)                 // %3
      : "r"(1),                     // %4
        "r"(6)                      // %5
      : "cc", "memory", "q0", "q1"  // Clobber List
      );
2605
}
2606

2607 2608 2609 2610
// %y passes a float as a scalar vector for vector * scalar multiply.
// the register must be d0 to d15 and indexed with [0] or [1] to access
// the float in the first or second float of the d-reg

2611 2612 2613 2614
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
2615 2616
  asm volatile(

2617
      "1:                                        \n"
2618 2619 2620 2621 2622 2623
      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
      "vmovl.u16  q2, d2                         \n"  // 8 int's
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
2624 2625
      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
      "vmul.f32   q3, q3, %y3                    \n"
2626 2627 2628 2629 2630 2631 2632
      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13                    \n"
      "vst1.8     {q1}, [%1]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src),              // %0
        "+r"(dst),              // %1
        "+r"(width)             // %2
2633 2634
      : "w"(1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
2635 2636
}

2637 2638 2639 2640
void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
2641 2642
  asm volatile(

2643
      "1:                                        \n"
2644 2645 2646 2647 2648 2649
      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
      "vmovl.u16  q2, d2                         \n"  // 8 int's
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
2650 2651
      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
      "vmul.f32   q3, q3, %y3                    \n"
2652 2653 2654 2655 2656 2657 2658
      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
      "vqshrn.u32 d3, q3, #13                    \n"
      "vst1.8     {q1}, [%1]!                    \n"
      "bgt        1b                             \n"
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
2659 2660
      : "w"(scale * 1.9259299444e-34f)  // %3
      : "cc", "memory", "q1", "q2", "q3");
2661 2662
}

2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676
void ByteToFloatRow_NEON(const uint8_t* src,
                         float* dst,
                         float scale,
                         int width) {
  asm volatile(

      "1:                                        \n"
      "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
      "vmovl.u8   q1, d2                         \n"  // 8 shorts
      "vmovl.u16  q2, d2                         \n"  // 8 ints
      "vmovl.u16  q3, d3                         \n"
      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
      "vcvt.f32.u32  q3, q3                      \n"
2677 2678
      "vmul.f32   q2, q2, %y3                    \n"  // scale
      "vmul.f32   q3, q3, %y3                    \n"
2679 2680 2681 2682 2683
      "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
      "bgt        1b                             \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
2684 2685
      : "w"(scale)   // %3
      : "cc", "memory", "q1", "q2", "q3");
2686 2687
}

2688
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
2689

2690 2691 2692 2693
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif