enc.c 24.2 KB
Newer Older
AoD314's avatar
AoD314 committed
1 2
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
AoD314's avatar
AoD314 committed
8 9 10 11 12 13
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)

14
#include <assert.h>
AoD314's avatar
AoD314 committed
15 16
#include <stdlib.h>  // for abs()

17 18
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
AoD314's avatar
AoD314 committed
19

AoD314's avatar
AoD314 committed
20 21
// Clamps 'v' to the [0, 255] range and returns it as an 8-bit sample.
static WEBP_INLINE uint8_t clip_8b(int v) {
  // (v & ~0xff) == 0 means no bit outside the low byte is set, i.e. 'v' is
  // already within [0, 255]; otherwise saturate according to the sign.
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}

24
#if !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
25 26
// Returns 'v' limited from above by 'max' (no lower bound is applied).
static WEBP_INLINE int clip_max(int v, int max) {
  return (v > max) ? max : v;
}
28
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
29

AoD314's avatar
AoD314 committed
30 31 32 33
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.

AoD314's avatar
AoD314 committed
34 35 36 37 38 39 40 41 42 43 44
// Offsets of the top-left sample of each 4x4 sub-block inside a buffer whose
// rows are BPS bytes apart. Every entry is 'x + y * BPS'.
// Layout: 16 luma entries (4 rows of 4 blocks), then 4 entries labelled U
// (columns 0 and 4) and 4 entries labelled V (columns 8 and 12).
const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
// General-purpose utility: summarizes 'distribution' into 'histo'.
// Stores the largest bin count in histo->max_value and the index of the last
// bin holding a strictly positive count in histo->last_non_zero.
// When no bin is positive, max_value is 0 and last_non_zero stays at 1.
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
                         VP8Histogram* const histo) {
  int largest = 0;
  int last_used = 1;
  int bin = 0;
  while (bin <= MAX_COEFF_THRESH) {
    const int count = distribution[bin];
    if (count > 0) {
      last_used = bin;
      if (largest < count) largest = count;
    }
    ++bin;
  }
  histo->last_non_zero = last_used;
  histo->max_value = largest;
}

61 62 63 64
#if !WEBP_NEON_OMIT_C_CODE
// Builds a histogram of transform-coefficient magnitudes for the 4x4 blocks
// with indices in [start_block, end_block) and stores its summary in 'histo'.
// Each block is located through VP8DspScan[] in both 'ref' and 'pred'.
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
                               int start_block, int end_block,
                               VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int k;
    int16_t out[16];

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin.
    for (k = 0; k < 16; ++k) {
      const int v = abs(out[k]) >> 3;
      // Everything above the threshold is counted in the last bin.
      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
      ++distribution[clipped_value];
    }
  }
  VP8SetHistogramData(distribution, histo);
}
82
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
83 84 85 86 87 88 89 90 91 92

//------------------------------------------------------------------------------
// run-time tables (~4k)

static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]

// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;

93
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
AoD314's avatar
AoD314 committed
94 95 96
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
AoD314's avatar
AoD314 committed
97
      clip1[255 + i] = clip_8b(i);
AoD314's avatar
AoD314 committed
98 99 100 101 102 103 104 105 106
    }
    tables_ok = 1;
  }
}


//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

107 108
#if !WEBP_NEON_OMIT_C_CODE

AoD314's avatar
AoD314 committed
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
// Fixed-point multipliers used by the inverse transform. They are meant to be
// combined with MUL(), which discards the 16 low bits of the product.
static const int kC2 = 35468;
static const int kC1 = 20091 + (1 << 16);

// Product of 'lhs' and 'rhs' shifted right by 16 bits.
#define MUL(lhs, rhs) (((lhs) * (rhs)) >> 16)

// Writes one reconstructed sample at column 'col', row 'row': the residual
// 'val' is scaled down by 8, added to the matching 'ref' sample and clipped
// to 8 bits. Expects 'dst' and 'ref' to be in scope at the expansion site.
#define STORE(col, row, val) \
  dst[(col) + (row) * BPS] = clip_8b(ref[(col) + (row) * BPS] + ((val) >> 3))

// Inverse-transforms the 16 coefficients in 'in' and writes the 4x4 block
// 'ref + residual' (clipped to 8 bits) into 'dst'. 'ref' and 'dst' are
// addressed with a row stride of BPS.
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  int work[4 * 4];
  int col, row;

  // Vertical pass: column 'col' of 'in' produces four consecutive entries
  // of 'work'.
  for (col = 0; col < 4; ++col) {
    const int16_t* const coeffs = in + col;
    int* const t = work + 4 * col;
    const int a = coeffs[0] + coeffs[8];
    const int b = coeffs[0] - coeffs[8];
    const int c = MUL(coeffs[4], kC2) - MUL(coeffs[12], kC1);
    const int d = MUL(coeffs[4], kC1) + MUL(coeffs[12], kC2);
    t[0] = a + d;
    t[1] = b + c;
    t[2] = b - c;
    t[3] = a - d;
  }

  // Horizontal pass: one output row per iteration. The '+ 4' is the rounding
  // term for the '>> 3' performed inside STORE().
  for (row = 0; row < 4; ++row) {
    const int* const t = work + row;
    const int dc = t[0] + 4;
    const int a = dc + t[8];
    const int b = dc - t[8];
    const int c = MUL(t[4], kC2) - MUL(t[12], kC1);
    const int d = MUL(t[4], kC1) + MUL(t[12], kC2);
    STORE(0, row, a + d);
    STORE(1, row, b + c);
    STORE(2, row, b - c);
    STORE(3, row, a - d);
  }
}

149 150
// Inverse-transforms one 4x4 block and, when 'do_two' is non-zero, also the
// block located 4 columns to the right (its coefficients start at in + 16).
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

157
// Forward transform of the 4x4 difference block 'src - ref' (both addressed
// with a row stride of BPS). Writes the 16 resulting coefficients to 'out'.
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  // First pass: one input row per iteration.
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = (d0 + d3);         // 10b                      [-510,510]
    const int a1 = (d1 + d2);
    const int a2 = (d1 - d2);
    const int a3 = (d0 - d3);
    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  // Second pass: one column of 'tmp' per iteration.
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}
185
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
186

187 188
// Forward-transforms two horizontally adjacent 4x4 blocks through the
// VP8FTransform pointer: the second block starts 4 columns to the right and
// its coefficients are written at out + 16.
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
                          int16_t* out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
}

193 194
#if !WEBP_NEON_OMIT_C_CODE
// Forward Walsh-Hadamard transform.
// Reads 16 input values spaced 16 entries apart (in[0], in[16], ...,
// in[240]) and writes 16 transformed values to out[0..15].
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
  // First pass: each iteration consumes four inputs (64 entries of 'in').
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
    const int a1 = (in[1 * 16] + in[3 * 16]);
    const int a2 = (in[1 * 16] - in[3 * 16]);
    const int a3 = (in[0 * 16] - in[2 * 16]);
    tmp[0 + i * 4] = a0 + a1;   // 14b
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second pass: one column of 'tmp' per iteration, results halved.
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;    // 16b
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    out[ 0 + i] = b0 >> 1;     // 15b
    out[ 4 + i] = b1 >> 1;
    out[ 8 + i] = b2 >> 1;
    out[12 + i] = b3 >> 1;
  }
}
223
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240

#undef MUL
#undef STORE

//------------------------------------------------------------------------------
// Intra predictions

// Sets every sample of the size x size square at 'dst' (row stride BPS)
// to 'value'.
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int row = 0;
  while (row < size) {
    memset(dst + row * BPS, value, size);
    ++row;
  }
}

static WEBP_INLINE void VerticalPred(uint8_t* dst,
                                     const uint8_t* top, int size) {
  int j;
241
  if (top != NULL) {
AoD314's avatar
AoD314 committed
242 243 244 245 246 247 248 249
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  } else {
    Fill(dst, 127, size);
  }
}

static WEBP_INLINE void HorizontalPred(uint8_t* dst,
                                       const uint8_t* left, int size) {
250
  if (left != NULL) {
AoD314's avatar
AoD314 committed
251 252 253 254 255 256 257 258 259 260 261 262
    int j;
    for (j = 0; j < size; ++j) {
      memset(dst + j * BPS, left[j], size);
    }
  } else {
    Fill(dst, 129, size);
  }
}

static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top, int size) {
  int y;
263 264
  if (left != NULL) {
    if (top != NULL) {
AoD314's avatar
AoD314 committed
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
      const uint8_t* const clip = clip1 + 255 - left[-1];
      for (y = 0; y < size; ++y) {
        const uint8_t* const clip_table = clip + left[y];
        int x;
        for (x = 0; x < size; ++x) {
          dst[x] = clip_table[top[x]];
        }
        dst += BPS;
      }
    } else {
      HorizontalPred(dst, left, size);
    }
  } else {
    // true motion without left samples (hence: with default 129 value)
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
282
    if (top != NULL) {
AoD314's avatar
AoD314 committed
283 284 285 286 287 288 289 290 291 292 293 294
      VerticalPred(dst, top, size);
    } else {
      Fill(dst, 129, size);
    }
  }
}

// DC prediction: fills the size x size block with a single value computed as
// (sum + round) >> shift. The sum covers the 'top' and 'left' samples; when
// only one of the two edges is available its sum is doubled so the same
// 'round' and 'shift' apply. With no edge at all the value is 0x80.
static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
                               const uint8_t* top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
    if (left != NULL) {   // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {      // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
  } else if (left != NULL) {   // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
  } else {   // no top, no left, nothing.
    DC = 0x80;
  }
  Fill(dst, DC, size);
}

//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

316 317
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
                               const uint8_t* top) {
AoD314's avatar
AoD314 committed
318 319 320 321 322 323 324
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
325 326
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
AoD314's avatar
AoD314 committed
327 328 329 330 331 332 333 334 335
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
}

//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

336 337
static void Intra16Preds_C(uint8_t* dst,
                           const uint8_t* left, const uint8_t* top) {
AoD314's avatar
AoD314 committed
338 339 340 341 342 343 344 345 346
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
  TrueMotion(I16TM16 + dst, left, top, 16);
}

//------------------------------------------------------------------------------
// luma 4x4 prediction

347 348
// Sample at column 'x', row 'y' of the block pointed to by 'dst'
// (expects 'dst' to be in scope; rows are BPS bytes apart).
#define DST(x, y) dst[(x) + (y) * BPS]
// Rounded 3-tap average with weights 1:2:1, truncated to 8 bits.
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
// Rounded average of two values.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)

// Vertical 4x4 prediction: every row receives the same four values, each a
// 3-tap average centered on top[x] (reads top[-1] through top[4]).
static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  uint8_t smoothed[4];
  int x, y;
  for (x = 0; x < 4; ++x) {
    smoothed[x] = AVG3(top[x - 1], top[x], top[x + 1]);
  }
  for (y = 0; y < 4; ++y) {
    memcpy(dst + y * BPS, smoothed, 4);
  }
}

// Horizontal 4x4 prediction: each row is filled with one value, a 3-tap
// average over neighbouring edge samples read from top[-1] down to top[-5].
static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  // Multiplying by 0x01010101 replicates the 8-bit value into all four bytes.
  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}

// DC 4x4 prediction: fills the block with the rounded mean of the eight
// samples top[0..3] and top[-5..-2].
static void DC4(uint8_t* dst, const uint8_t* top) {
  uint32_t sum = 4;   // rounding term for the '>> 3' below
  int k;
  for (k = 0; k < 4; ++k) {
    sum += top[k] + top[k - 5];
  }
  Fill(dst, sum >> 3, 4);
}

// 4x4 prediction where all samples on a same down-right diagonal share one
// value: a 3-tap average over three consecutive edge samples.
static void RD4(uint8_t* dst, const uint8_t* top) {
  // The nine edge samples top[-5] .. top[3] seen as one contiguous run.
  const uint8_t* const edge = top - 5;
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      // Diagonal index: 0 at the bottom-left sample, 6 at the top-right one.
      const int k = 3 + x - y;
      DST(x, y) = AVG3(edge[k], edge[k + 1], edge[k + 2]);
    }
  }
}

// 4x4 prediction where all samples on a same down-left diagonal share one
// value: a 3-tap average over top[s], top[s + 1], top[s + 2] with s = x + y.
// On the last diagonal the third tap repeats top[7].
static void LD4(uint8_t* dst, const uint8_t* top) {
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int s = x + y;
      const int third = (s + 2 > 7) ? 7 : s + 2;
      DST(x, y) = AVG3(top[s], top[s + 1], top[third]);
    }
  }
}

// 4x4 prediction built from 2-tap and 3-tap averages of the edge samples
// top[-4] .. top[3]. Filters are computed once, then written row by row.
static void VR4(uint8_t* dst, const uint8_t* top) {
  const int K = top[-4];
  const int J = top[-3];
  const int I = top[-2];
  const int X = top[-1];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  // 2-tap averages.
  const int xa = AVG2(X, A);
  const int ab = AVG2(A, B);
  const int bc = AVG2(B, C);
  const int cd = AVG2(C, D);
  // 3-tap averages.
  const int kji = AVG3(K, J, I);
  const int jix = AVG3(J, I, X);
  const int ixa = AVG3(I, X, A);
  const int xab = AVG3(X, A, B);
  const int abc = AVG3(A, B, C);
  const int bcd = AVG3(B, C, D);

  DST(0, 0) = xa;   DST(1, 0) = ab;   DST(2, 0) = bc;   DST(3, 0) = cd;
  DST(0, 1) = ixa;  DST(1, 1) = xab;  DST(2, 1) = abc;  DST(3, 1) = bcd;
  DST(0, 2) = jix;  DST(1, 2) = xa;   DST(2, 2) = ab;   DST(3, 2) = bc;
  DST(0, 3) = kji;  DST(1, 3) = ixa;  DST(2, 3) = xab;  DST(3, 3) = abc;
}

// 4x4 prediction built from 2-tap and 3-tap averages of top[0..7].
// Filters are computed once, then written row by row.
static void VL4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  // 2-tap averages.
  const int ab = AVG2(A, B);
  const int bc = AVG2(B, C);
  const int cd = AVG2(C, D);
  const int de = AVG2(D, E);
  // 3-tap averages.
  const int abc = AVG3(A, B, C);
  const int bcd = AVG3(B, C, D);
  const int cde = AVG3(C, D, E);
  const int def = AVG3(D, E, F);
  const int efg = AVG3(E, F, G);
  const int fgh = AVG3(F, G, H);

  DST(0, 0) = ab;   DST(1, 0) = bc;   DST(2, 0) = cd;   DST(3, 0) = de;
  DST(0, 1) = abc;  DST(1, 1) = bcd;  DST(2, 1) = cde;  DST(3, 1) = def;
  DST(0, 2) = bc;   DST(1, 2) = cd;   DST(2, 2) = de;   DST(3, 2) = efg;
  DST(0, 3) = bcd;  DST(1, 3) = cde;  DST(2, 3) = def;  DST(3, 3) = fgh;
}

// 4x4 prediction built from 2-tap and 3-tap averages of top[-2] .. top[-5].
// The last row and the right half of the third row are plain copies of
// top[-5].
static void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  // 2-tap averages.
  const int ij = AVG2(I, J);
  const int jk = AVG2(J, K);
  const int kl = AVG2(K, L);
  // 3-tap averages.
  const int ijk = AVG3(I, J, K);
  const int jkl = AVG3(J, K, L);
  const int kll = AVG3(K, L, L);

  DST(0, 0) = ij;  DST(1, 0) = ijk;  DST(2, 0) = jk;  DST(3, 0) = jkl;
  DST(0, 1) = jk;  DST(1, 1) = jkl;  DST(2, 1) = kl;  DST(3, 1) = kll;
  DST(0, 2) = kl;  DST(1, 2) = kll;  DST(2, 2) = L;   DST(3, 2) = L;
  DST(0, 3) = L;   DST(1, 3) = L;    DST(2, 3) = L;   DST(3, 3) = L;
}

// 4x4 prediction built from 2-tap and 3-tap averages of the edge samples
// top[-5] .. top[2]. Filters are computed once, then written row by row.
static void HD4(uint8_t* dst, const uint8_t* top) {
  const int L = top[-5];
  const int K = top[-4];
  const int J = top[-3];
  const int I = top[-2];
  const int X = top[-1];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  // 2-tap averages.
  const int ix = AVG2(I, X);
  const int ji = AVG2(J, I);
  const int kj = AVG2(K, J);
  const int lk = AVG2(L, K);
  // 3-tap averages.
  const int abc = AVG3(A, B, C);
  const int xab = AVG3(X, A, B);
  const int ixa = AVG3(I, X, A);
  const int jix = AVG3(J, I, X);
  const int kji = AVG3(K, J, I);
  const int lkj = AVG3(L, K, J);

  DST(0, 0) = ix;  DST(1, 0) = ixa;  DST(2, 0) = xab;  DST(3, 0) = abc;
  DST(0, 1) = ji;  DST(1, 1) = jix;  DST(2, 1) = ix;   DST(3, 1) = ixa;
  DST(0, 2) = kj;  DST(1, 2) = kji;  DST(2, 2) = ji;   DST(3, 2) = jix;
  DST(0, 3) = lk;  DST(1, 3) = lkj;  DST(2, 3) = kj;   DST(3, 3) = kji;
}

static void TM4(uint8_t* dst, const uint8_t* top) {
  int x, y;
  const uint8_t* const clip = clip1 + 255 - top[-1];
  for (y = 0; y < 4; ++y) {
    const uint8_t* const clip_table = clip + top[-2 - y];
    for (x = 0; x < 4; ++x) {
      dst[x] = clip_table[top[x]];
    }
    dst += BPS;
  }
}

#undef DST
#undef AVG3
#undef AVG2

// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
520
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
AoD314's avatar
AoD314 committed
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
  HE4(I4HE4 + dst, top);
  RD4(I4RD4 + dst, top);
  VR4(I4VR4 + dst, top);
  LD4(I4LD4 + dst, top);
  VL4(I4VL4 + dst, top);
  HD4(I4HD4 + dst, top);
  HU4(I4HU4 + dst, top);
}

//------------------------------------------------------------------------------
// Metric

536
#if !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
// Returns the sum of squared differences between the w x h areas at 'a' and
// 'b', both addressed with a row stride of BPS.
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                              int w, int h) {
  int sse = 0;
  int row;
  for (row = 0; row < h; ++row) {
    const uint8_t* const pa = a + row * BPS;
    const uint8_t* const pb = b + row * BPS;
    int col;
    for (col = 0; col < w; ++col) {
      const int delta = (int)pa[col] - pb[col];
      sse += delta * delta;
    }
  }
  return sse;
}

552
// Sum of squared differences over a 16x16 area.
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 16);
}
555
// Sum of squared differences over an area 16 samples wide and 8 rows high.
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 16, 8);
}
558
// Sum of squared differences over an 8x8 area.
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 8, 8);
}
561
// Sum of squared differences over a 4x4 area.
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
}
564
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
565

566
// For each of the four horizontally adjacent 4x4 blocks starting at 'ref',
// stores the sum of its 16 samples into dc[0..3]. Note that the stored value
// is the plain sum: no division is performed here.
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) {
        avg += ref[x + y * BPS];
      }
    }
    dc[k] = avg;
    ref += 4;   // go to next 4x4 block.
  }
}

AoD314's avatar
AoD314 committed
580 581 582 583 584 585
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

586
#if !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
587 588
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
AoD314's avatar
AoD314 committed
590 591 592 593 594 595
// Applies a 4x4 Hadamard transform to the block at 'in' (row stride BPS) and
// returns the sum of the absolute transformed values, each multiplied by the
// matching entry of the 16-entry weight table 'w'.
static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12+ i];
    const int a2 = tmp[4 + i] - tmp[12+ i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;

    // 'w' advances with 'i', so w[0], w[4], w[8], w[12] walk down column i.
    sum += w[ 0] * abs(b0);
    sum += w[ 4] * abs(b1);
    sum += w[ 8] * abs(b2);
    sum += w[12] * abs(b3);
  }
  return sum;
}

624 625
// Texture distortion between two 4x4 blocks: absolute difference of their
// weighted transform sums, scaled down by 32.
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
}

631 632
// Texture distortion between two 16x16 blocks: sum of Disto4x4_C() over the
// sixteen 4x4 sub-blocks.
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  int D = 0;
  int x, y;
  // 'y' already holds the row offset in bytes (steps of 4 rows).
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_C(a + x + y, b + x + y, w);
    }
  }
  return D;
}
642
#endif  // !WEBP_NEON_OMIT_C_CODE
AoD314's avatar
AoD314 committed
643 644 645 646 647 648 649 650 651 652

//------------------------------------------------------------------------------
// Quantization
//

// Scan order used by the quantizer: kZigzag[n] is the position, inside a
// row-major 4x4 coefficient block, of the n-th coefficient visited.
static const uint8_t kZigzag[16] = {
   0,  1,  4,  8,
   5,  2,  3,  6,
   9, 12, 13, 10,
   7, 11, 14, 15
};

// Simple quantization
653 654
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
                           const VP8Matrix* const mtx) {
AoD314's avatar
AoD314 committed
655
  int last = -1;
656 657
  int n;
  for (n = 0; n < 16; ++n) {
AoD314's avatar
AoD314 committed
658 659
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
660
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
AoD314's avatar
AoD314 committed
661
    if (coeff > mtx->zthresh_[j]) {
662 663 664 665 666 667 668 669 670
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)Q;
      out[n] = level;
      if (level) last = n;
AoD314's avatar
AoD314 committed
671 672 673 674 675 676 677 678
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);
}

679 680 681
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// Quantizes two consecutive 16-coefficient blocks through the
// VP8EncQuantizeBlock pointer. Bit 0 of the result is the return value for
// the first block, bit 1 the return value for the second one.
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}
687
#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
688

AoD314's avatar
AoD314 committed
689 690 691
//------------------------------------------------------------------------------
// Block copy

692
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
AoD314's avatar
AoD314 committed
693
  int y;
694 695
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);
AoD314's avatar
AoD314 committed
696 697 698 699 700
    src += BPS;
    dst += BPS;
  }
}

701
// Copies a 4x4 block from 'src' to 'dst'.
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4, 4);
}

705
// Copies a block 16 bytes wide and 8 rows high from 'src' to 'dst'.
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16, 8);
}

AoD314's avatar
AoD314 committed
709 710 711 712 713 714 715 716
//------------------------------------------------------------------------------
// Initialization

// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
717
VP8Fdct VP8FTransform2;
AoD314's avatar
AoD314 committed
718 719 720 721 722 723 724 725 726 727
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
728
VP8MeanMetric VP8Mean16x4;
AoD314's avatar
AoD314 committed
729
VP8QuantizeBlock VP8EncQuantizeBlock;
730 731
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
AoD314's avatar
AoD314 committed
732
VP8BlockCopy VP8Copy4x4;
733
VP8BlockCopy VP8Copy16x8;
AoD314's avatar
AoD314 committed
734 735

extern void VP8EncDspInitSSE2(void);
736 737
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitAVX2(void);
AoD314's avatar
AoD314 committed
738
extern void VP8EncDspInitNEON(void);
739 740 741
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
AoD314's avatar
AoD314 committed
742

743
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
744
  VP8DspInit();  // common inverse transforms
AoD314's avatar
AoD314 committed
745 746 747
  InitTables();

  // default C implementations
748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773
#if !WEBP_NEON_OMIT_C_CODE
  VP8ITransform = ITransform_C;
  VP8FTransform = FTransform_C;
  VP8FTransformWHT = FTransformWHT_C;
  VP8TDisto4x4 = Disto4x4_C;
  VP8TDisto16x16 = Disto16x16_C;
  VP8CollectHistogram = CollectHistogram_C;
  VP8SSE16x16 = SSE16x16_C;
  VP8SSE16x8 = SSE16x8_C;
  VP8SSE8x8 = SSE8x8_C;
  VP8SSE4x4 = SSE4x4_C;
#endif

#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  VP8EncQuantizeBlock = QuantizeBlock_C;
  VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif

  VP8FTransform2 = FTransform2_C;
  VP8EncPredLuma4 = Intra4Preds_C;
  VP8EncPredLuma16 = Intra16Preds_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
  VP8Mean16x4 = Mean16x4_C;
  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;
AoD314's avatar
AoD314 committed
774 775

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
776
  if (VP8GetCPUInfo != NULL) {
AoD314's avatar
AoD314 committed
777 778 779
#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
780 781 782 783 784 785 786 787 788 789
#if defined(WEBP_USE_SSE41)
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8EncDspInitSSE41();
      }
#endif
    }
#endif
#if defined(WEBP_USE_AVX2)
    if (VP8GetCPUInfo(kAVX2)) {
      VP8EncDspInitAVX2();
AoD314's avatar
AoD314 committed
790
    }
791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
#endif
#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspInitMIPSdspR2();
    }
#endif
#if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8EncDspInitMSA();
    }
AoD314's avatar
AoD314 committed
806 807
#endif
  }
808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835

#if defined(WEBP_USE_NEON)
  if (WEBP_NEON_OMIT_C_CODE ||
      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
    VP8EncDspInitNEON();
  }
#endif

  assert(VP8ITransform != NULL);
  assert(VP8FTransform != NULL);
  assert(VP8FTransformWHT != NULL);
  assert(VP8TDisto4x4 != NULL);
  assert(VP8TDisto16x16 != NULL);
  assert(VP8CollectHistogram != NULL);
  assert(VP8SSE16x16 != NULL);
  assert(VP8SSE16x8 != NULL);
  assert(VP8SSE8x8 != NULL);
  assert(VP8SSE4x4 != NULL);
  assert(VP8EncQuantizeBlock != NULL);
  assert(VP8EncQuantize2Blocks != NULL);
  assert(VP8FTransform2 != NULL);
  assert(VP8EncPredLuma4 != NULL);
  assert(VP8EncPredLuma16 != NULL);
  assert(VP8EncPredChroma8 != NULL);
  assert(VP8Mean16x4 != NULL);
  assert(VP8EncQuantizeBlockWHT != NULL);
  assert(VP8Copy4x4 != NULL);
  assert(VP8Copy16x8 != NULL);
AoD314's avatar
AoD314 committed
836
}