/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;    // NOLINT
typedef unsigned short uint16;  // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L)                                            \
  {                                                               \
    {{PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
      PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0}}            \
  }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time
// initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif

// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw,
                           double xm,
                           double ym,
                           double xxm,
                           double xym,
                           double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small errors are possible, due to rounding. Clamp to zero.
  if (sxx < 0.)
    sxx = 0.;
  if (syy < 0.)
    syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}
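
// Notation note (editorial, not in the original): with weighted means iwx/iwy,
// variances sxx/syy and covariance sxy as computed above, l, c and s are the
// luminance, contrast and structure terms of Wang et al.'s SSIM with dynamic
// range L = 255, C1 = (0.01 * L)^2 and C2 = (0.03 * L)^2. The structure
// constant here is C3 = (0.015 * L)^2 = C2 / 4 (not the C2 / 2 of the common
// simplified form), so c and s are kept as separate factors instead of
// collapsing into (2 * sxy + C2) / (sxx + syy + C2). Sanity check: for
// identical inputs (xm == ym, xxm == xym == yym) each factor is exactly 1,
// e.g. FinalizeSSIM(kiW[0], w, w, ww, ww, ww) == 1.0 up to rounding.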

// GetSSIM() does clipping. GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// per row and per column, i.e. 33 * 33 = 1089 for the full window, with a
// diff of 255, squared. The maximum accumulated value is thus
// 1089 * 255^2 = 0x4388241, which fits into a 32-bit integer.
double GetSSIM(const uint8* org,
               const uint8* rec,
               int xo,
               int yo,
               int W,
               int H,
               int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
      continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
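
// Illustrative sanity check (compiled out; not part of the library). Away
// from the borders no kernel tap is clipped, so GetSSIM() accumulates the
// full weight ws == 1089 and must agree with GetSSIMFullKernel() called with
// kiW[0] == 1/1089. In the SSE2 build, GetSSIMFullKernel() additionally reads
// 8 bytes per row starting at (xo - KERNEL), so xo must also stay far enough
// from the right edge for those loads to be in bounds.
#if 0
static void CheckInteriorAgreement(const uint8* org, const uint8* rec, int xo,
                                   int yo, int W, int H, int stride) {
  // Assumes KERNEL <= xo, yo and that the window (and the SSE2 8-byte
  // loads) stay inside the W x H image.
  const double clipped = GetSSIM(org, rec, xo, yo, W, H, stride);
  const double full = GetSSIMFullKernel(org, rec, xo, yo, stride, kiW[0]);
  (void)clipped;  // Expected: clipped == full (identical integer sums).
  (void)full;
}
#endif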

double GetSSIMFullKernel(const uint8* org,
                         const uint8* rec,
                         int xo,
                         int yo,
                         int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those four points.
      // - - - - - - -
      // - ul - - - ur -
      // - - - - - - -
      // - - - 0 - - -
      // - - - - - - -
      // - ll - - - lr -
      // - - - - - - -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points across the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those four points.
    // - - - - - - -
    // - - - u - - -
    // - - - - - - -
    // - l - 0 - r -
    // - - - - - - -
    // - - - d - - -
    // - - - - - - -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly the contribution of the center (xo, yo) point.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, apply the line's
// weights and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
  do {                                                                       \
    const __m128i v0 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
    const __m128i v1 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
  } while (0)

#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32 tmp[4];                                          \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
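
// For reference, a scalar equivalent of one LOAD_LINE_PAIR(L, WL) step above
// (illustrative sketch, compiled out; not part of the library). weight[] is
// the i16_[] payload of one of the W0..W3 unions; its 8th entry is 0, which
// masks the extra pixel brought in by the 8-byte load.
#if 0
static void LoadLinePairScalar(const uint8* org, const uint8* rec, int stride,
                               int L, const uint16 weight[8], uint32* x,
                               uint32* y, uint32* xx, uint32* xy, uint32* yy) {
  for (int i = 0; i < 8; ++i) {
    const uint32 w0 = weight[i] * org[L * stride + i];  // lane i of ww0
    const uint32 w1 = weight[i] * rec[L * stride + i];  // lane i of ww1
    *x += w0;                         // unpacklo/unpackhi + add_epi32
    *y += w1;
    *xx += w0 * org[L * stride + i];  // madd(ww0, w0): weight * org^2
    *xy += w0 * rec[L * stride + i];  // madd(ww0, w1): weight * org * rec
    *yy += w1 * rec[L * stride + i];  // madd(ww1, w1): weight * rec^2
  }
}
#endif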

static int start_max(int x, int y) {
  return (x > y) ? x : y;
}

double CalcSSIM(const uint8* org,
                const uint8* rec,
                const int image_width,
                const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels into a scratch area, and pad this area
      // with zeros, which won't contribute to the overall SSIM value (but we
      // need to pass the correct normalizing constant!). By using this
      // scratch area, we can still call GetSSIMFullKernel() instead of the
      // slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
                                  kScratchStride, kiW[k]);
      }
    }
  }

  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}

#ifdef __cplusplus
}  // extern "C"
#endif
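
// Usage sketch (illustrative, compiled out; the function and buffer names
// here are hypothetical, not part of this file). CalcSSIM() returns the sum
// of per-pixel SSIM scores, one score per pixel position, so a caller divides
// by the pixel count to obtain the mean SSIM before converting it to the
// dB-like LSSIM scale.
#if 0
static double ExamplePlaneScore(const uint8* plane_org, const uint8* plane_rec,
                                int width, int height) {
  const double total = CalcSSIM(plane_org, plane_rec, width, height);
  const double mean_ssim = total / (static_cast<double>(width) * height);
  return CalcLSSIM(mean_ssim);  // == -10 * log10(1 - mean_ssim)
}
#endif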