/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
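
// SSE2 variance, sum-of-squares and MSE kernels for the VP9 encoder, plus
// macro-generated wrappers that assemble the sub-pixel variance functions
// from the narrower kernels declared further below.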

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

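// Common signature of the fixed-size get*var kernels below. Results are
// returned through *sse and *sum; the return value is always 0.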
typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       unsigned int *sse, int *sum);

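// Sum of squares of 256 int16 values (a 16x16 block): _mm_madd_epi16
// squares and pairwise-adds eight values per iteration, and the four
// 32-bit partial sums are folded horizontally at the end.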
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

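// Packs two 4-pixel rows (rows i and i + 1 of p) into the low 8 bytes of a
// register by byte interleaving. src and ref are interleaved identically,
// so the ordering does not affect the sums taken over their difference.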
#define READ64(p, stride, i) \
  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

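// 4x4 sum and SSE. Rows are loaded in pairs via READ64 and zero-extended
// to 16 bits; |sum| <= 16 * 255, so the 16-bit horizontal adds cannot
// overflow.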
unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
                       _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);

  return 0;
}

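// 8x8 sum and SSE, two rows per iteration. Each 16-bit lane accumulates a
// single column, and even the fully folded sum is at most 64 * 255, so the
// 16-bit adds are safe.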
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}

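// 16x16 sum and SSE, one full row per iteration. After the two 16-bit
// folds each remaining lane holds at most 128 * 255 = 32640, but the sum
// of the two lanes can exceed int16 range, so the last fold extracts both
// lanes and adds them as ints.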
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}

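// Tiles a w x h block into block_size x block_size sub-blocks, calling
// var_fn on each and accumulating the partial SSE and sum.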
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

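// A scalar reference for the get*var kernels above -- a validation sketch,
// not part of the upstream file. It produces the same *sse/*sum pair the
// SIMD kernels report for an arbitrary w x h block; the helper name
// scalar_get_var is hypothetical.
#if 0
static unsigned int scalar_get_var(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h,
                                   unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += diff;                        // signed sum of differences
      *sse += (unsigned int)(diff * diff); // sum of squared differences
    }
  }
  return 0;  // matches the (unused) return value of the SIMD kernels
}
#endif

// Each vp9_variance{W}x{H} wrapper below returns
//   variance = sse - sum^2 / (W * H),
// with the division implemented as a right shift by log2(W * H).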
unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, vp9_get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, vp9_get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

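// From here on the blocks hold at least 512 pixels, so sum * sum no longer
// fits in 32 bits (|sum| can reach 1024 * 255 for 32x32) and the product
// is widened to int64_t.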
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

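// MSE is the SSE without the mean correction, so these simply run the
// matching variance kernel and return the raw *sse.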
unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

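// Prototypes for the sub-pixel variance kernels of fixed width (4, 8 and
// 16 pixels), implemented outside this file (in assembly). The 4-wide
// kernel is built with opt2, which is plain SSE for the SSE2 flavor.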
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

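// Builds the full w x h sub-pixel variance function on top of the wf-wide
// kernel: for w > wf the remaining columns are handled by extra kernel
// calls at pixel offsets 16, 32 and 48 (wf is always 16 in that case), and
// the accumulated se/sse feed the usual variance formula, shifted by
// wlog2 + hlog2. The `cast` argument widens se * se to 64 bits for blocks
// of more than 256 pixels.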
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

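// Instantiate both flavors; in the SSE2 build the 4-wide kernel falls back
// to plain SSE (opt2 == sse).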
FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

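// The _avg_ variants average the prediction with a second predictor `sec`
// before computing the variance. `sec` is a contiguous w x h buffer, which
// is why the kernels are invoked with w as sec_stride.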
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN