Home | History | Annotate | Download | only in util
      1 /*
      2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "../util/ssim.h"  // NOLINT
     12 
     13 #include <string.h>
     14 
     15 #ifdef __cplusplus
     16 extern "C" {
     17 #endif
     18 
     19 typedef unsigned int uint32;    // NOLINT
     20 typedef unsigned short uint16;  // NOLINT
     21 
     22 #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
     23     (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
     24 #define __SSE2__
     25 #endif
     26 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
     27 #include <emmintrin.h>
     28 #endif
     29 
     30 #ifdef _OPENMP
     31 #include <omp.h>
     32 #endif
     33 
// SSIM
// Kernel radius (KERNEL) and full width (KERNEL_SIZE) of the 7x7 window.
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
// Normalizing constants for the 2D kernel: each entry is the reciprocal of
// the sum of the kernel weights actually covered, for a kernel that is
// progressively clipped on one side (see the per-entry index ranges below).
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
     50 
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

// Product of two kernel taps as a 16-bit lane value; the max product
// 11 * 11 = 121 stays below 128, as required for _mm_mullo_epi16() above.
#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
// Expands to the initializer for one row's 8-lane weight vector:
// K[L]*K[0..6] plus a zero lane so the unused 8th pixel of each 8-pixel
// load contributes nothing.
#define MAKE_WEIGHT(L)                                                \
  {                                                                   \
    {                                                                 \
      {                                                               \
        PWEIGHT(L, 0)                                                 \
        , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
            PWEIGHT(L, 5), PWEIGHT(L, 6), 0                           \
      }                                                               \
    }                                                                 \
  }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
     78 
     79 // Common final expression for SSIM, once the weighted sums are known.
     80 static double FinalizeSSIM(double iw,
     81                            double xm,
     82                            double ym,
     83                            double xxm,
     84                            double xym,
     85                            double yym) {
     86   const double iwx = xm * iw;
     87   const double iwy = ym * iw;
     88   double sxx = xxm * iw - iwx * iwx;
     89   double syy = yym * iw - iwy * iwy;
     90   // small errors are possible, due to rounding. Clamp to zero.
     91   if (sxx < 0.)
     92     sxx = 0.;
     93   if (syy < 0.)
     94     syy = 0.;
     95   const double sxsy = sqrt(sxx * syy);
     96   const double sxy = xym * iw - iwx * iwy;
     97   static const double C11 = (0.01 * 0.01) * (255 * 255);
     98   static const double C22 = (0.03 * 0.03) * (255 * 255);
     99   static const double C33 = (0.015 * 0.015) * (255 * 255);
    100   const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
    101   const double c = (2. * sxsy + C22) / (sxx + syy + C22);
    102   const double s = (sxy + C33) / (sxsy + C33);
    103   return l * c * s;
    104 }
    105 
    106 // GetSSIM() does clipping.  GetSSIMFullKernel() does not
    107 
    108 // TODO(skal): use summed tables?
    109 // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
    110 // with a diff of 255, squared. The maximum error is thus 0x4388241,
    111 // which fits into 32 bits integers.
    112 double GetSSIM(const uint8* org,
    113                const uint8* rec,
    114                int xo,
    115                int yo,
    116                int W,
    117                int H,
    118                int stride) {
    119   uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
    120   org += (yo - KERNEL) * stride;
    121   org += (xo - KERNEL);
    122   rec += (yo - KERNEL) * stride;
    123   rec += (xo - KERNEL);
    124   for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    125     if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
    126       continue;
    127     const int Wy = K[y_];
    128     for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
    129       const int Wxy = Wy * K[x_];
    130       if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
    131         const int org_x = org[x_];
    132         const int rec_x = rec[x_];
    133         ws += Wxy;
    134         xm += Wxy * org_x;
    135         ym += Wxy * rec_x;
    136         xxm += Wxy * org_x * org_x;
    137         xym += Wxy * org_x * rec_x;
    138         yym += Wxy * rec_x * rec_x;
    139       }
    140     }
    141   }
    142   return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
    143 }
    144 
// Computes SSIM for the full (unclipped) 7x7 kernel centered at (xo, yo).
// Unlike GetSSIM() no border clipping is performed: the caller must
// guarantee the whole window — and, in the SSE2 path, 8 readable pixels per
// row starting at (xo - KERNEL) — lies inside both buffers.  area_weight is
// the matching normalizing constant (one of the kiW[] entries).
double GetSSIMFullKernel(const uint8* org,
                         const uint8* rec,
                         int xo,
                         int yo,
                         int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  // Scalar path: walk only one quadrant of offsets and use the kernel's
  // symmetry to accumulate four mirrored samples per iteration.
  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;  // row offset in org
    const int dy2 = y * stride;  // row offset in rec (same stride)
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of upper-left (ul), upper-right (ur)
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // Symmetric Kernel will have same weight on those points.
      //       -  -  -  -  -  -  -
      //       -  ul -  -  -  ur -
      //       -  -  -  -  -  -  -
      //       -  -  -  0  -  -  -
      //       -  -  -  -  -  -  -
      //       -  ll -  -  -  lr -
      //       -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of up (u), down (d), left (l) and right (r)
    // points across the main axes (see the diagram below).
    // Symmetric Kernel will have same weight on those points.
    //       -  -  -  -  -  -  -
    //       -  -  -  u  -  -  -
    //       -  -  -  -  -  -  -
    //       -  l  -  0  -  r  -
    //       -  -  -  -  -  -  -
    //       -  -  -  d  -  -  -
    //       -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly the contribution of (x0, y0) point.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  // SIMD path: each of the 7 kernel rows is handled with a single 8-pixel
  // load weighted by the precomputed vectors W0..W3 (rows are symmetric).
  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, and convert to 16bit, perform weighting
// and acccumulate.
#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
  do {                                                                       \
    const __m128i v0 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
    const __m128i v1 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
  } while (0)

// Horizontal sum: adds the four 32-bit lanes of M into the scalar OUT.
#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32 tmp[4];                                          \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

  // Weight vectors mirror around the center row (W0 W1 W2 W3 W2 W1 W0).
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
    290 
    291 static int start_max(int x, int y) {
    292   return (x > y) ? x : y;
    293 }
    294 
// Sums the per-pixel SSIM contributions over the whole image.  The image is
// processed in three horizontal bands so the fast, non-clipping
// GetSSIMFullKernel() handles the interior while the slower, clipping
// GetSSIM() handles the borders.  org and rec must both be
// image_width x image_height with stride == image_width.
double CalcSSIM(const uint8* org,
                const uint8* rec,
                const int image_width,
                const int image_height) {
  double SSIM = 0.;
  // Border thickness needing the clipping path (kernel radius, capped for
  // images smaller than the kernel).
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  // First column whose 8-pixel SSE2 load would run past the right edge;
  // columns from start_x on go through the scratch-buffer path below.
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top band: clipped kernel on every column.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : SSIM)
#endif
  // Middle band: full-kernel fast path for the interior rows.
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    // Left border columns: clipped kernel.
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    // Interior columns: full kernel with full normalization kiW[0].
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels on a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we can
      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};

      // Copy the KERNEL_SIZE rows of the 8 rightmost pixels into the
      // zero-padded scratch buffers.
      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      // kiW[k] supplies the normalization matching the progressively
      // clipped right side of the kernel (see kiW's per-entry comments).
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
                                  kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom band: clipped kernel on every column.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}
    354 
    355 double CalcLSSIM(double ssim) {
    356   return -10.0 * log10(1.0 - ssim);
    357 }
    358 
    359 #ifdef __cplusplus
    360 }  // extern "C"
    361 #endif
    362