Home | History | Annotate | Download | only in dsp
      1 // Copyright 2017 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // SSE2 version of distortion calculation
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_SSE2)
     17 
     18 #include <assert.h>
     19 #include <emmintrin.h>
     20 
     21 #include "src/dsp/common_sse2.h"
     22 
     23 #if !defined(WEBP_DISABLE_STATS)
     24 
     25 // Helper function
     // Squares the per-byte differences between 'a' and 'b' (both treated as 16
     // unsigned bytes) and writes the partial sums to '*sum' as four 32-bit lanes.
     // Each output lane holds the sum of four squared differences.
     static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a,
                                                    const __m128i b,
                                                    __m128i* const sum) {
       const __m128i kZero = _mm_setzero_si128();
       // |a - b| per byte: max(a, b) - min(a, b) can never underflow.
       const __m128i larger = _mm_max_epu8(a, b);
       const __m128i smaller = _mm_min_epu8(a, b);
       const __m128i abs_diff = _mm_sub_epi8(larger, smaller);
       // Widen the low and the high 8 bytes to 16-bit lanes.
       const __m128i diff_lo = _mm_unpacklo_epi8(abs_diff, kZero);
       const __m128i diff_hi = _mm_unpackhi_epi8(abs_diff, kZero);
       // Square every 16-bit lane; madd also adds adjacent pairs into 32 bits.
       const __m128i sq_lo = _mm_madd_epi16(diff_lo, diff_lo);
       const __m128i sq_hi = _mm_madd_epi16(diff_hi, diff_hi);
       *sum = _mm_add_epi32(sq_lo, sq_hi);
     }
     41 
     42 //------------------------------------------------------------------------------
     43 // SSIM / PSNR entry point
     44 
     // Returns the sum of squared differences between the first 'len' bytes of
     // 'src1' and 'src2', accumulated modulo 2^32. Inputs of at least 16 bytes go
     // through the SSE2 path in 16-byte blocks; whatever remains (or the whole
     // input when 'len' < 16) is handled by the scalar loop. A 'len' <= 0 yields 0.
     static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                        const uint8_t* src2, int len) {
       int i = 0;
       uint32_t sse2 = 0;
       if (len >= 16) {
         // Each loop iteration reads 32 more bytes, on top of the 16 bytes that
         // are always pending in a0/b0.
         const int limit = len - 32;
         // Unsigned storage on purpose: the lanes of 'sum' are plain 32-bit
         // accumulators, and adding them as signed integers could overflow
         // (undefined behavior) for long inputs with large differences.
         uint32_t tmp[4];
         __m128i sum1;
         __m128i sum = _mm_setzero_si128();
         __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
         __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
         i += 16;
         while (i <= limit) {
           // The loads for the next block are issued before the pending block
           // is processed.
           const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
           const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
           __m128i sum2;
           i += 16;
           SubtractAndSquare_SSE2(a0, b0, &sum1);
           sum = _mm_add_epi32(sum, sum1);
           a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
           b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
           i += 16;
           SubtractAndSquare_SSE2(a1, b1, &sum2);
           sum = _mm_add_epi32(sum, sum2);
         }
         // Flush the last pending 16-byte block.
         SubtractAndSquare_SSE2(a0, b0, &sum1);
         sum = _mm_add_epi32(sum, sum1);
         _mm_storeu_si128((__m128i*)tmp, sum);
         sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
       }

       // Scalar tail: fewer than 32 bytes are left at this point.
       for (; i < len; ++i) {
         const int32_t diff = src1[i] - src2[i];
         sse2 += (uint32_t)(diff * diff);
       }
       return sse2;
     }
     82 #endif  // !defined(WEBP_DISABLE_STATS)
     83 
     84 #if !defined(WEBP_REDUCE_SIZE)
     85 
     // Returns the sum of the eight 16-bit lanes of '*m'. The upper four lanes are
     // first added onto the lower four with 16-bit wraparound; the four resulting
     // lanes are then summed in 32 bits.
     static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
       uint16_t lanes[4];
       uint32_t total = 0;
       int k;
       const __m128i v = *m;
       // lane[k] += lane[k + 4], for k = 0..3
       const __m128i folded = _mm_add_epi16(v, _mm_srli_si128(v, 8));
       // Only the low 64 bits (lanes 0..3) are needed.
       _mm_storel_epi64((__m128i*)lanes, folded);
       for (k = 0; k < 4; ++k) {
         total += lanes[k];
       }
       return total;
     }
     93 
     // Returns the sum (modulo 2^32) of the four 32-bit lanes of '*m'.
     static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
       const __m128i v = *m;
       // Bring lanes 2,3 down onto lanes 0,1 and add them.
       const __m128i upper = _mm_srli_si128(v, 8);
       const __m128i pairs = _mm_add_epi32(v, upper);
       // Bring lane 1 down onto lane 0 and add: lane 0 now holds the total.
       const __m128i odd = _mm_srli_si128(pairs, 4);
       const __m128i total = _mm_add_epi32(pairs, odd);
       return (uint32_t)_mm_cvtsi128_si32(total);
     }
    100 
    101 static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
    102 
    103 #define ACCUMULATE_ROW(WEIGHT) do {                         \
    104   /* compute row weight (Wx * Wy) */                        \
    105   const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
    106   const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
    107   /* process 8 bytes at a time (7 bytes, actually) */       \
    108   const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
    109   const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
    110   /* convert to 16b and multiply by weight */               \
    111   const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
    112   const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
    113   const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
    114   const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
    115   /* accumulate */                                          \
    116   xm  = _mm_add_epi16(xm, wa1);                             \
    117   ym  = _mm_add_epi16(ym, wb1);                             \
    118   xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
    119   xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
    120   yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
    121   src1 += stride1;                                          \
    122   src2 += stride2;                                          \
    123 } while (0)
    124 
    125 static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
    126                            const uint8_t* src2, int stride2) {
    127   VP8DistoStats stats;
    128   const __m128i zero = _mm_setzero_si128();
    129   __m128i xm = zero, ym = zero;                // 16b accums
    130   __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
    131   const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
    132   assert(2 * VP8_SSIM_KERNEL + 1 == 7);
    133   ACCUMULATE_ROW(1);
    134   ACCUMULATE_ROW(2);
    135   ACCUMULATE_ROW(3);
    136   ACCUMULATE_ROW(4);
    137   ACCUMULATE_ROW(3);
    138   ACCUMULATE_ROW(2);
    139   ACCUMULATE_ROW(1);
    140   stats.xm  = HorizontalAdd16b_SSE2(&xm);
    141   stats.ym  = HorizontalAdd16b_SSE2(&ym);
    142   stats.xxm = HorizontalAdd32b_SSE2(&xxm);
    143   stats.xym = HorizontalAdd32b_SSE2(&xym);
    144   stats.yym = HorizontalAdd32b_SSE2(&yym);
    145   return VP8SSIMFromStats(&stats);
    146 }
    147 
    148 #endif  // !defined(WEBP_REDUCE_SIZE)
    149 
    150 extern void VP8SSIMDspInitSSE2(void);
    151 
    152 WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
    153 #if !defined(WEBP_DISABLE_STATS)
    154   VP8AccumulateSSE = AccumulateSSE_SSE2;
    155 #endif
    156 #if !defined(WEBP_REDUCE_SIZE)
    157   VP8SSIMGet = SSIMGet_SSE2;
    158 #endif
    159 }
    160 
    161 #else  // !WEBP_USE_SSE2
    162 
    163 WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)  // no SSE2: presumably expands to a stub VP8SSIMDspInitSSE2 -- confirm against the macro definition
    164 
    165 #endif  // WEBP_USE_SSE2
    166