/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

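// Local fixed-width aliases kept from libyuv's legacy integer names;
// <stdint.h>'s uint32_t and uint64_t are the modern equivalents.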
typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // _MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else   // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// When libyuv is built with jpeg support (HAVE_JPEG), the library itself
// provides ComputeSumSquareError(), so the local implementations below are
// compiled out.
#if !defined(HAVE_JPEG)

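// All SIMD implementations below consume 16 bytes per loop iteration;
// ComputeSumSquareError() calls them only with a |count| that is a positive
// multiple of 16 and handles the final tail with the portable C version.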
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      // Clear the q7, q8, q9 and q10 accumulators.
      "vmov.u8    q7, #0                         \n"
      "vmov.u8    q9, #0                         \n"
      "vmov.u8    q8, #0                         \n"
      "vmov.u8    q10, #0                        \n"

      // Load 16 bytes from each source, widen-subtract to 16-bit
      // differences, and multiply-accumulate the squares into 32-bit lanes.
      "1:                                        \n"
      "vld1.u8    {q0}, [%0]!                    \n"
      "vld1.u8    {q1}, [%1]!                    \n"
      "vsubl.u8   q2, d0, d2                     \n"
      "vsubl.u8   q3, d1, d3                     \n"
      "vmlal.s16  q7, d4, d4                     \n"
      "vmlal.s16  q8, d6, d6                     \n"
      "vmlal.s16  q8, d5, d5                     \n"
      "vmlal.s16  q10, d7, d7                    \n"
      "subs       %2, %2, #16                    \n"
      "bhi        1b                             \n"

      // Reduce the accumulator lanes to a single 32-bit sum.
      "vadd.u32   q7, q7, q8                     \n"
      "vadd.u32   q9, q9, q10                    \n"
      "vadd.u32   q10, q7, q9                    \n"
      "vpaddl.u32 q1, q10                        \n"
      "vadd.u64   d0, d2, d3                     \n"
      "vmov.32    %3, d0[0]                      \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      // Clear the v16-v19 accumulators.
      "eor        v16.16b, v16.16b, v16.16b      \n"
      "eor        v18.16b, v18.16b, v18.16b      \n"
      "eor        v17.16b, v17.16b, v17.16b      \n"
      "eor        v19.16b, v19.16b, v19.16b      \n"

      // Load 16 bytes from each source, widen-subtract to 16-bit
      // differences, and multiply-accumulate the squares into 32-bit lanes.
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"
      "ld1        {v1.16b}, [%1], #16            \n"
      "subs       %w2, %w2, #16                  \n"
      "usubl      v2.8h, v0.8b, v1.8b            \n"
      "usubl2     v3.8h, v0.16b, v1.16b          \n"
      "smlal      v16.4s, v2.4h, v2.4h           \n"
      "smlal      v17.4s, v3.4h, v3.4h           \n"
      "smlal2     v18.4s, v2.8h, v2.8h           \n"
      "smlal2     v19.4s, v3.8h, v3.8h           \n"
      "b.gt       1b                             \n"

      // Reduce the accumulator lanes to a single 32-bit sum.
      "add        v16.4s, v16.4s, v17.4s         \n"
      "add        v18.4s, v18.4s, v19.4s         \n"
      "add        v19.4s, v16.4s, v18.4s         \n"
      "addv       s0, v19.4s                     \n"
      "fmov       %w3, s0                        \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                                    const uint8* /*src_b*/,
                                                    int /*count*/) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0  // accumulator
    pxor       xmm5, xmm5  // constant zero for unpacking
    sub        edx, eax  // address src_b relative to src_a

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax,  [eax + 16]
    // Absolute difference: or the two saturating subtractions together.
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    // Widen to 16 bits, then square and pairwise-add into 32-bit lanes.
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    // Horizontal add of the four 32-bit lanes; return the sum in eax.
    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  uint32 sse;
  asm volatile(  // NOLINT
      "pxor      %%xmm0,%%xmm0                   \n"  // accumulator
      "pxor      %%xmm5,%%xmm5                   \n"  // constant zero
      "sub       %0,%1                           \n"  // src_b relative to src_a

      "1:                                        \n"
      "movdqu    (%0),%%xmm1                     \n"
      "movdqu    (%0,%1,1),%%xmm2                \n"
      "lea       0x10(%0),%0                     \n"
      // Absolute difference: or the two saturating subtractions together.
      "movdqu    %%xmm1,%%xmm3                   \n"
      "psubusb   %%xmm2,%%xmm1                   \n"
      "psubusb   %%xmm3,%%xmm2                   \n"
      "por       %%xmm2,%%xmm1                   \n"
      // Widen to 16 bits, then square and pairwise-add into 32-bit lanes.
      "movdqu    %%xmm1,%%xmm2                   \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "punpckhbw %%xmm5,%%xmm2                   \n"
      "pmaddwd   %%xmm1,%%xmm1                   \n"
      "pmaddwd   %%xmm2,%%xmm2                   \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "paddd     %%xmm2,%%xmm0                   \n"
      "sub       $0x10,%2                        \n"
      "ja        1b                              \n"

      // Horizontal add of the four 32-bit lanes.
      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "movd      %%xmm0,%3                       \n"

      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc"
#if defined(__SSE2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
      );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
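// Under i386 PIC (including Apple's i386 ABI), EBX is reserved as the GOT
// pointer, so it is saved in EDI around the cpuid instruction below.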
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "mov %%ebx, %%edi                          \n"
      "cpuid                                     \n"
      "xchg %%edi, %%ebx                         \n"
      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "cpuid                                     \n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
#endif

static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {  // EDX bit 26 is the SSE2 feature flag.
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

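// Portable reference implementation; also used for the final sub-16-byte
// tail in ComputeSumSquareError().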
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

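// Sums squared errors in 32 KiB blocks (parallelized with OpenMP when
// enabled) so each block's 32-bit sum cannot overflow, then accumulates the
// per-block results into a 64-bit total.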
double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  const int kBlockSize = 1 << 15;  // 32768-byte blocks.
  uint64 sse = 0;
  // Process whole blocks, optionally in parallel.
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  // Process the multiple-of-16 part of the remainder with the selected
  // implementation.
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  // The final 0-15 bytes always go through the portable C version.
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
#endif  // !defined(HAVE_JPEG)

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}
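
// A worked example (hypothetical numbers, not from this file): for an 8-bit
// 1920x1080 frame where ComputeSumSquareError() returned sse == 2073600.0
// (an average squared error of 1.0 per pixel),
//   ComputePSNR(2073600.0, 1920.0 * 1080.0)
//       == 10 * log10(255.0 * 255.0 * 2073600.0 / 2073600.0)
//       == 10 * log10(65025.0), roughly 48.13 dB.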

#ifdef __cplusplus
}  // extern "C"
#endif