1. 26 Jan, 2017 2 commits
  2. 24 Jan, 2017 1 commit
  3. 20 Jan, 2017 3 commits
  4. 18 Jan, 2017 1 commit
    • Manojkumar Bhosale's avatar
      Add MSA optimized NV12/21 To RGB row functions · 09b8c971
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C auto-vectorized)
      NV12ToARGBRow_MSA       - ~1.5x
      NV12ToARGBRow_Any_MSA   - ~1.4x
      NV12ToRGB565Row_MSA     - ~1.4x
      NV12ToRGB565Row_Any_MSA - ~1.4x
      NV21ToARGBRow_MSA       - ~1.5x
      NV21ToARGBRow_Any_MSA   - ~1.5x
      SobelRow_MSA            - ~4.3x
      SobelRow_Any_MSA        - ~3.4x
      SobelToPlaneRow_MSA     - ~8.0x
      SobelToPlaneRow_Any_MSA - ~4.7x
      SobelXYRow_MSA          - ~3.0x
      SobelXYRow_Any_MSA      - ~2.5x
      
      Performance Gain (vs C non-vectorized)
      NV12ToARGBRow_MSA       - ~6.5x
      NV12ToARGBRow_Any_MSA   - ~6.5x
      NV12ToRGB565Row_MSA     - ~6.2x
      NV12ToRGB565Row_Any_MSA - ~6.1x
      NV21ToARGBRow_MSA       - ~6.5x
      NV21ToARGBRow_Any_MSA   - ~6.5x
      SobelRow_MSA            - ~14.5x
      SobelRow_Any_MSA        - ~11.3x
      SobelToPlaneRow_MSA     - ~34.2x
      SobelToPlaneRow_Any_MSA - ~19.4x
      SobelXYRow_MSA          - ~11.1x
      SobelXYRow_Any_MSA      - ~9.1x
      
      Review-Url: https://codereview.chromium.org/2636483002 .
      09b8c971
  5. 13 Jan, 2017 3 commits
    • Frank Barchard's avatar
      add Intel Code Analyst markers · a7c87e19
      Frank Barchard authored
      Add macros to enable/disable Intel Architecture Code Analyzer (IACA)
      markers around blocks of code.
      
      Normally these macros should not be used, but if performance
      details are wanted for Intel code, enable them around the code
      and then run it through the IACA tool, available on the Intel website.
      
      BUG=libyuv:670
      TEST=~/iaca-lin64/bin/iaca.sh -64 out/Release/libyuv_unittest
      R=wangcheng@google.com
      
      Review-Url: https://codereview.chromium.org/2626193002 .
      a7c87e19
    • Manojkumar Bhosale's avatar
      Add MSA optimized rotate functions (used 16x16 transpose) · 73a6f100
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      TransposeWx16_MSA        - ~6.0x
      TransposeWx16_Any_MSA    - ~4.7x
      TransposeUVWx16_MSA      - ~6.3x
      TransposeUVWx16_Any_MSA  - ~5.4x
      
      Performance Gain (vs C non-vectorized)
      TransposeWx16_MSA        - ~6.0x
      TransposeWx16_Any_MSA    - ~4.8x
      TransposeUVWx16_MSA      - ~6.3x
      TransposeUVWx16_Any_MSA  - ~5.4x
      
      Review-Url: https://codereview.chromium.org/2617703002 .
      73a6f100
    • Manojkumar Bhosale's avatar
      Add MSA optimized RAW/RGB/ARGB to ARGB/Y/UV row functions · 7c64163f
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ARGB1555ToARGBRow_MSA     - 1.85
      ARGB1555ToARGBRow_Any_MSA - 1.82
      RGB565ToARGBRow_MSA       - 2.14
      RGB565ToARGBRow_Any_MSA   - 2.08
      RGB24ToARGBRow_MSA        - 8.57
      RGB24ToARGBRow_Any_MSA    - 7.42
      RAWToARGBRow_MSA          - 8.57
      RAWToARGBRow_Any_MSA      - 7.42
      ARGB1555ToYRow_MSA        - 2.60
      ARGB1555ToYRow_Any_MSA    - 2.47
      RGB565ToYRow_MSA          - 2.45
      RGB565ToYRow_Any_MSA      - 2.33
      RGB24ToYRow_MSA           - 2.23
      RGB24ToYRow_Any_MSA       - 2.01
      RAWToYRow_MSA             - 2.25
      RAWToYRow_Any_MSA         - 2.02
      ARGB1555ToUVRow_MSA       - 1.40
      ARGB1555ToUVRow_Any_MSA   - 1.37
      RGB565ToUVRow_MSA         - 1.68
      RGB565ToUVRow_Any_MSA     - 1.63
      RGB24ToUVRow_MSA          - 3.02
      RGB24ToUVRow_Any_MSA      - 2.87
      RAWToUVRow_MSA            - 3.04
      RAWToUVRow_Any_MSA        - 2.85
      
      Performance Gain (vs C non-vectorized)
      ARGB1555ToARGBRow_MSA     - 4.66
      ARGB1555ToARGBRow_Any_MSA - 4.45
      RGB565ToARGBRow_MSA       - 5.58
      RGB565ToARGBRow_Any_MSA   - 5.34
      RGB24ToARGBRow_MSA        - 8.57
      RGB24ToARGBRow_Any_MSA    - 7.42
      RAWToARGBRow_MSA          - 8.57
      RAWToARGBRow_Any_MSA      - 7.42
      ARGB1555ToYRow_MSA        - 6.38
      ARGB1555ToYRow_Any_MSA    - 5.98
      RGB565ToYRow_MSA          - 6.42
      RGB565ToYRow_Any_MSA      - 6.05
      RGB24ToYRow_MSA           - 7.87
      RGB24ToYRow_Any_MSA       - 7.01
      RAWToYRow_MSA             - 7.98
      RAWToYRow_Any_MSA         - 7.01
      ARGB1555ToUVRow_MSA       - 5.39
      ARGB1555ToUVRow_Any_MSA   - 5.06
      RGB565ToUVRow_MSA         - 6.39
      RGB565ToUVRow_Any_MSA     - 5.90
      RGB24ToUVRow_MSA          - 3.04
      RGB24ToUVRow_Any_MSA      - 2.87
      RAWToUVRow_MSA            - 3.04
      RAWToUVRow_Any_MSA        - 2.88
      
      Review-Url: https://codereview.chromium.org/2600713002 .
      7c64163f
  6. 11 Jan, 2017 2 commits
  7. 21 Dec, 2016 1 commit
    • Manojkumar Bhosale's avatar
      Add MSA optimized remaining scale row functions · 288bfbef
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ScaleRowDown2_MSA            - ~22.3x
      ScaleRowDown2_Any_MSA        - ~19.9x
      ScaleRowDown2Linear_MSA      - ~31.2x
      ScaleRowDown2Linear_Any_MSA  - ~29.4x
      ScaleRowDown2Box_MSA         - ~20.1x
      ScaleRowDown2Box_Any_MSA     - ~19.6x
      ScaleRowDown4_MSA            - ~11.7x
      ScaleRowDown4_Any_MSA        - ~11.2x
      ScaleRowDown4Box_MSA         - ~15.1x
      ScaleRowDown4Box_Any_MSA     - ~15.1x
      ScaleRowDown38_MSA           - ~1x
      ScaleRowDown38_Any_MSA       - ~1x
      ScaleRowDown38_2_Box_MSA     - ~1.7x
      ScaleRowDown38_2_Box_Any_MSA - ~1.7x
      ScaleRowDown38_3_Box_MSA     - ~1.7x
      ScaleRowDown38_3_Box_Any_MSA - ~1.7x
      ScaleAddRow_MSA              - ~1.2x
      ScaleAddRow_Any_MSA          - ~1.15x
      
      Performance Gain (vs C non-vectorized)
      ScaleRowDown2_MSA            - ~22.4x
      ScaleRowDown2_Any_MSA        - ~19.8x
      ScaleRowDown2Linear_MSA      - ~31.6x
      ScaleRowDown2Linear_Any_MSA  - ~29.4x
      ScaleRowDown2Box_MSA         - ~20.1x
      ScaleRowDown2Box_Any_MSA     - ~19.6x
      ScaleRowDown4_MSA            - ~11.7x
      ScaleRowDown4_Any_MSA        - ~11.2x
      ScaleRowDown4Box_MSA         - ~15.1x
      ScaleRowDown4Box_Any_MSA     - ~15.1x
      ScaleRowDown38_MSA           - ~3.2x
      ScaleRowDown38_Any_MSA       - ~3.2x
      ScaleRowDown38_2_Box_MSA     - ~2.4x
      ScaleRowDown38_2_Box_Any_MSA - ~2.3x
      ScaleRowDown38_3_Box_MSA     - ~2.9x
      ScaleRowDown38_3_Box_Any_MSA - ~2.8x
      ScaleAddRow_MSA              - ~8x
      ScaleAddRow_Any_MSA          - ~7.46x
      
      Review-Url: https://codereview.chromium.org/2559683002 .
      288bfbef
  8. 19 Dec, 2016 1 commit
  9. 15 Dec, 2016 2 commits
    • Manojkumar Bhosale's avatar
      Add MSA optimized ARGB Attenuate/RGB565/Shuffle/Shader/Gray/Sepia row functions · a899dea2
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ARGBAttenuateRow_MSA          - ~1.1x
      ARGBAttenuateRow_Any_MSA      - ~1.1x
      ARGBToRGB565DitherRow_MSA     - ~6.4x
      ARGBToRGB565DitherRow_Any_MSA - ~6.2x
      ARGBShuffleRow_MSA            - ~5.1x
      ARGBShuffleRow_Any_MSA        - ~1.9x
      ARGBShadeRow_MSA              - ~1.1x
      ARGBGrayRow_MSA               - ~2.6x
      ARGBSepiaRow_MSA              - ~11.6x
      
      Performance Gain (vs C non-vectorized)
      ARGBAttenuateRow_MSA          - ~2.46x
      ARGBAttenuateRow_Any_MSA      - ~2.45x
      ARGBToRGB565DitherRow_MSA     - ~9.4x
      ARGBToRGB565DitherRow_Any_MSA - ~12.5x
      ARGBShuffleRow_MSA            - ~5.2x
      ARGBShuffleRow_Any_MSA        - ~1.9x
      ARGBShadeRow_MSA              - ~4.3x
      ARGBGrayRow_MSA               - ~10.5x
      ARGBSepiaRow_MSA              - ~12.2x
      
      Review-Url: https://codereview.chromium.org/2559693002 .
      a899dea2
    • Manojkumar Bhosale's avatar
      Add MSA optimized TransposeWx8_MSA and TransposeUVWx8_MSA functions · 6fa5e4eb
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      TransposeWx8_MSA          - ~2.7x
      TransposeWx8_Any_MSA      - ~2.1x
      TransposeUVWx8_MSA        - ~2.5x
      TransposeUVWx8_Any_MSA    - ~2.7x
      
      Performance Gain (vs C non-vectorized)
      TransposeWx8_MSA          - ~4.6x
      TransposeWx8_Any_MSA      - ~2.9x
      TransposeUVWx8_MSA        - ~4.4x
      TransposeUVWx8_Any_MSA    - ~3.7x
      
      Review URL: https://codereview.chromium.org/2553403002 .
      6fa5e4eb
  10. 14 Dec, 2016 1 commit
  11. 07 Dec, 2016 2 commits
    • Frank Barchard's avatar
      ConvertFromI420: use halfstride instead of halfwidth · dde8ba70
      Frank Barchard authored
      BUG=libyuv:660
      TEST=try bots
      R=kjellander@chromium.org
      
      Review URL: https://codereview.chromium.org/2554213003 .
      dde8ba70
    • Manojkumar Bhosale's avatar
      Add MSA optimized ARGB scaling functions · 56b5bbb0
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ScaleARGBRowDown2_MSA           - ~2.6x
      ScaleARGBRowDown2Linear_MSA     - ~7.9x
      ScaleARGBRowDown2Box_MSA        - ~3.7x
      ScaleARGBRowDownEven_MSA        - ~1.2x
      ScaleARGBRowDownEvenBox_MSA     - ~3.5x
      
      ScaleARGBRowDown2_Any_MSA       - ~2.6x
      ScaleARGBRowDown2Linear_Any_MSA - ~7.9x
      ScaleARGBRowDown2Box_Any_MSA    - ~3.6x
      ScaleARGBRowDownEven_Any_MSA    - ~1.2x
      ScaleARGBRowDownEvenBox_Any_MSA - ~3.5x
      
      Performance Gain (vs C non-vectorized)
      ScaleARGBRowDown2_MSA           - 2.6x
      ScaleARGBRowDown2Linear_MSA     - 13.5x
      ScaleARGBRowDown2Box_MSA        - 5.8x
      ScaleARGBRowDownEven_MSA        - 1.2x
      ScaleARGBRowDownEvenBox_MSA     - 3.7x
      
      ScaleARGBRowDown2_Any_MSA       - 2.6x
      ScaleARGBRowDown2Linear_Any_MSA - 13.5x
      ScaleARGBRowDown2Box_Any_MSA    - 5.3x
      ScaleARGBRowDownEven_Any_MSA    - 1.2x
      ScaleARGBRowDownEvenBox_Any_MSA - 3.7x
      
      Review URL: https://codereview.chromium.org/2527983002 .
      56b5bbb0
  12. 02 Dec, 2016 1 commit
    • Manojkumar Bhosale's avatar
      Add MSA optimized ARGB Multiply/Add/Subtract row functions · 83f460be
      Manojkumar Bhosale authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ARGBMultiplyRow_MSA       - 1.4x
      ARGBAddRow_MSA            - 8.6x
      ARGBSubtractRow_MSA       - 8.6x
      
      ARGBMultiplyRow_Any_MSA   - 1.35x
      ARGBAddRow_Any_MSA        - 7.3x
      ARGBSubtractRow_Any_MSA   - 7.2x
      
      Performance Gain (vs C non-vectorized)
      ARGBMultiplyRow_MSA       - 4.4x
      ARGBAddRow_MSA            - 27x
      ARGBSubtractRow_MSA       - 22x
      
      ARGBMultiplyRow_Any_MSA   - 3.5x
      ARGBAddRow_Any_MSA        - 23x
      ARGBSubtractRow_Any_MSA   - 18x
      
      Review URL: https://codereview.chromium.org/2529983002 .
      83f460be
  13. 22 Nov, 2016 1 commit
    • Frank Barchard's avatar
      Add MSA optimized ARGBToRGB565Row_MSA, ARGBToARGB1555Row_MSA,… · da0c29da
      Frank Barchard authored
      Add MSA optimized ARGBToRGB565Row_MSA, ARGBToARGB1555Row_MSA, ARGBToARGB4444Row_MSA, ARGBToUV444Row_MSA functions
      
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gain (vs C vectorized)
      ARGBToRGB565Row_MSA       - ~1.6x
      ARGBToRGB565Row_Any_MSA   - ~1.6x
      ARGBToARGB1555Row_MSA     - ~1.3x
      ARGBToARGB1555Row_Any_MSA - ~1.3x
      ARGBToARGB4444Row_MSA     - ~3.8x
      ARGBToARGB4444Row_Any_MSA - ~3.8x
      ARGBToUV444Row_MSA        - ~2.4x
      ARGBToUV444Row_Any_MSA    - ~2.4x
      
      Performance Gain (vs C non-vectorized)
      ARGBToRGB565Row_MSA       - ~2.8x
      ARGBToRGB565Row_Any_MSA   - ~2.8x
      ARGBToARGB1555Row_MSA     - ~2.2x
      ARGBToARGB1555Row_Any_MSA - ~2.2x
      ARGBToARGB4444Row_MSA     - ~6.8x
      ARGBToARGB4444Row_Any_MSA - ~6.6x
      ARGBToUV444Row_MSA        - ~6.7x
      ARGBToUV444Row_Any_MSA    - ~6.7x
      
      Review URL: https://codereview.chromium.org/2520003004 .
      da0c29da
  14. 18 Nov, 2016 1 commit
  15. 09 Nov, 2016 1 commit
  16. 08 Nov, 2016 3 commits
  17. 07 Nov, 2016 1 commit
  18. 01 Nov, 2016 1 commit
  19. 27 Oct, 2016 1 commit
  20. 26 Oct, 2016 3 commits
  21. 25 Oct, 2016 3 commits
  22. 24 Oct, 2016 1 commit
    • Frank Barchard's avatar
      Add MSA optimized I422ToARGBRow_MSA and I422ToRGBARow_MSA functions · f5d5bd88
      Frank Barchard authored
      R=fbarchard@google.com
      BUG=libyuv:634
      
      Performance Gains :- (vs C vectorized)
      
      I422ToARGBRow_MSA     : ~1.6x
      I422ToRGBARow_MSA     : ~1.6x
      
      I422ToARGBRow_Any_MSA : ~1.58x
      I422ToRGBARow_Any_MSA : ~1.6x
      
      Performance Gains :- (vs C non-vectorized)
      
      I422ToARGBRow_MSA     : ~7x
      I422ToRGBARow_MSA     : ~7x
      
      I422ToARGBRow_Any_MSA : ~6.9x
      I422ToRGBARow_Any_MSA : ~6.8x
      
      Regarding performance measurement: we created standalone tests that pass row data from a filled 1920x1080 buffer to both the C and MSA functions. Each test is executed for N iterations to get more accurate C vs MSA timings.
      
      Review URL: https://codereview.chromium.org/2430313005 .
      f5d5bd88
  23. 21 Oct, 2016 1 commit
    • Frank Barchard's avatar
      scale by 1 for neon implemented · 451af5e9
      Frank Barchard authored
      // Converts a row of unsigned 16-bit integers to half floats with no
      // scaling; the unnamed float parameter is ignored.
      //
      // src:   input row of 16-bit values, read 8 at a time.
      // dst:   output row of 16-bit half floats, written 8 at a time.
      // width: number of values to convert. The loop body runs before the count
      //        is tested, so at least 8 values are always processed.
      //        NOTE(review): presumably callers pass a positive multiple of 8
      //        (with a separate wrapper for remainders) - confirm.
      void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
        asm volatile (
        "1:                                          \n"
          MEMACCESS(0)
          "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, advance src
          "subs       %w2, %w2, #8                   \n"  // 8 values per loop; sets flags for b.gt
          "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
          "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts to 32-bit ints
          "scvtf      v2.4s, v2.4s                   \n"  // low 4 ints to floats
          "scvtf      v1.4s, v1.4s                   \n"  // high 4 ints to floats
          "fcvtn      v4.4h, v2.4s                   \n"  // narrow low 4 floats to half floats
          "fcvtn2     v4.8h, v1.4s                   \n"  // narrow high 4 floats to half floats
         MEMACCESS(1)
          "st1        {v4.16b}, [%1], #16            \n"  // store 8 half floats, advance dst
          "b.gt       1b                             \n"  // loop while remaining width > 0
        : "+r"(src),    // %0
          "+r"(dst),    // %1
          "+r"(width)   // %2
        :
        : "cc", "memory", "v1", "v2", "v4"
        );
      }
      
      // Converts a row of unsigned 16-bit integers to half floats, multiplying
      // each value by scale.
      //
      // Each value is converted to a single-precision float and multiplied by
      // scale * 1.9259299444e-34f (approximately 2^-112). The resulting 32-bit
      // pattern is then shifted right by 13 with unsigned saturation and narrowed
      // to 16 bits to form the output.
      // NOTE(review): 112 matches the difference between the single and half
      // precision exponent biases (127 - 15) and 13 the difference in mantissa
      // widths (23 - 10), which is presumably why the shifted bits form a valid
      // half float - confirm rounding/denormal behavior is acceptable.
      //
      // src:   input row of 16-bit values, read 8 at a time.
      // dst:   output row of 16-bit half floats, written 8 at a time.
      // scale: multiplier applied to every value before conversion.
      // width: number of values to convert. The loop body runs before the count
      //        is tested, so at least 8 values are always processed.
      //        NOTE(review): presumably callers pass a positive multiple of 8
      //        (with a separate wrapper for remainders) - confirm.
      void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
        asm volatile (
        "1:                                          \n"
          MEMACCESS(0)
          "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, advance src
          "subs       %w2, %w2, #8                   \n"  // 8 values per loop; sets flags for b.gt
          "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
          "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts to 32-bit ints
          "scvtf      v2.4s, v2.4s                   \n"  // low 4 ints to floats
          "scvtf      v1.4s, v1.4s                   \n"  // high 4 ints to floats
          "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // multiply by pre-biased scale (adjust exponent)
          "fmul       v1.4s, v1.4s, %3.s[0]          \n"
          "uqshrn     v4.4h, v2.4s, #13              \n"  // shift right 13 and narrow: isolate half float
          "uqshrn2    v4.8h, v1.4s, #13              \n"
         MEMACCESS(1)
          "st1        {v4.16b}, [%1], #16            \n"  // store 8 half floats, advance dst
          "b.gt       1b                             \n"  // loop while remaining width > 0
        : "+r"(src),    // %0
          "+r"(dst),    // %1
          "+r"(width)   // %2
        : "w"(scale * 1.9259299444e-34f)    // %3: scale * ~2^-112, in a SIMD/FP register
        : "cc", "memory", "v1", "v2", "v4"
        );
      }
      
      TEST=LibYUVPlanarTest.TestHalfFloatPlane_One
      BUG=libyuv:560
      R=hubbe@chromium.org
      
      Review URL: https://codereview.chromium.org/2430313008 .
      451af5e9
  24. 20 Oct, 2016 2 commits
  25. 19 Oct, 2016 1 commit