/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>    // sqrt() and log10() used below
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;     // NOLINT
typedef unsigned short uint16;   // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
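// (Concretely: a per-lane weight product is at most 11 * 11 = 121 and a pixel
// value is at most 255, so 121 * 255 = 30855 < 2^15 and the signed 16-bit
// products never wrap.)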
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1056.,   // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
  1. / 957.,    // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
  1. / 726.,    // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
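// These normalizers can be checked by hand: sum(K) = 1+3+7+11+7+3+1 = 33, so
// the full 7x7 window weighs 33 * 33 = 1089; dropping the rightmost one, two
// or three columns leaves 32 * 33 = 1056, 29 * 33 = 957 and 22 * 33 = 726.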

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
#define MAKE_WEIGHT(L)                                               \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
  // ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif

// Common final expression for SSIM, once the weighted sums are known.
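// For reference, this is the usual SSIM decomposition (Wang et al., 2004)
// into luminance, contrast and structure terms:
//   l = (2*ux*uy + C1) / (ux^2 + uy^2 + C1)
//   c = (2*sx*sy + C2) / (sx^2 + sy^2 + C2)
//   s = (sxy + C3)     / (sx*sy + C3)
// where ux, uy are the weighted means, sx^2, sy^2 the weighted variances,
// sxy the weighted covariance, and iw is the inverse of the total kernel
// weight used to normalize the accumulated sums.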
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small errors are possible, due to rounding. Clamp to zero.
  if (sxx < 0.) sxx = 0.;
  if (syy < 0.) syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}

// GetSSIM() does clipping.  GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: the worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum accumulated value is thus 0x4388241,
// which fits into 32-bit integers.
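// Checking the arithmetic: the full 2-D kernel weight is 33 * 33 = 1089, and
// 1089 * 255 * 255 = 70812225 = 0x4388241 < 2^32, so the uint32 accumulators
// below cannot overflow.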
double GetSSIM(const uint8 *org, const uint8 *rec,
               int xo, int yo, int W, int H, int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm  += Wxy * org_x;
        ym  += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}

double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // The symmetric kernel has the same weight on those points.
      //       -  -  -  -  -  -  -
      //       -  ul -  -  -  ur -
      //       -  -  -  -  -  -  -
      //       -  -  -  0  -  -  -
      //       -  -  -  -  -  -  -
      //       -  ll -  -  -  lr -
      //       -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points across the main axes (see the diagram below).
    // The symmetric kernel has the same weight on those points.
    //       -  -  -  -  -  -  -
    //       -  -  -  u  -  -  -
    //       -  -  -  -  -  -  -
    //       -  l  -  0  -  r  -
    //       -  -  -  -  -  -  -
    //       -  -  -  d  -  -  -
    //       -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm  += Wxy * (u1 + d1 + l1 + r1);
    ym  += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, the contribution of the center point (xo, yo) itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm  += Wxy * s1;
  ym  += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else   // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, apply the line's weights
// and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
  const __m128i v0 =                                                         \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
  const __m128i v1 =                                                         \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
} while (0)

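// Horizontally add the four 32-bit lanes of M and store the scalar sum in OUT.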
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
  uint32 tmp[4];                                                             \
  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
} while (0)

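  // Accumulate all 7 kernel rows. The Gaussian weights are symmetric around
  // the center row, so W0..W3 are reused in mirror order for the lower rows.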
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}

static int start_max(int x, int y) { return (x > y) ? x : y; }

double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

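  // The image is processed in three bands: the top KERNEL_Y rows and the rows
  // from start_y down use the bounds-checking GetSSIM(); the middle rows use
  // the faster GetSSIMFullKernel(), except for the leftmost KERNEL_X columns
  // (still clipped) and the columns past start_x, which go through a small
  // zero-padded scratch buffer so that the 8-pixel SSE2 loads stay in bounds.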
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
  #pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels to a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we can
      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}
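
// Illustrative usage sketch (not part of the library): for two same-sized
// 8-bit grayscale planes `orig` and `recon` of dimensions w x h, a per-frame
// score could be reported as the mean over all pixel positions, e.g.:
//
//   const double sum = CalcSSIM(orig, recon, w, h);
//   const double mean_ssim = sum / (w * h);     // assumes w * h > 0
//   const double lssim = CalcLSSIM(mean_ssim);  // dB-like scale
//
// The division by w * h is an assumption about how callers aggregate the
// per-position values that CalcSSIM() sums; adapt it as needed.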

#ifdef __cplusplus
}  // extern "C"
#endif