/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);

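// Sum of squares of a 16x16 macroblock of 16-bit residuals (256 values),
// accumulated with PMADDWD and reduced horizontally at the end.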
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

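// Loads 4 pixels from each of two consecutive rows (i and i + 1) and
// byte-interleaves them into the low 8 bytes of an XMM register.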
#define READ64(p, stride, i)                                  \
  _mm_unpacklo_epi8(                                          \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

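// Sum of differences and sum of squared differences for a 4x4 block.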
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

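// Sum of differences and sum of squared differences for an 8x8 block,
// processing two rows per loop iteration.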
void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

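// Sum of differences and sum of squared differences for a 16x16 block.
// The last step of the sum reduction adds the two remaining 16-bit lanes in
// 32-bit arithmetic, since the full 16x16 sum can exceed the int16_t range.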
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum =
      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

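// Accumulates sum and SSE over a w x h area by tiling it with
// block_size x block_size blocks handled by var_fn.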
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride, int w,
                          int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

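// variance = sse - sum^2 / (w * h); the shift in each wrapper below is
// log2(w * h) for that block size.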
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

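// For 16x16 and larger blocks the sum can reach +/-65280 or more, so
// sum * sum is computed in 64 bits before the shift to avoid 32-bit overflow.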
unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

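// MSE variants: identical to the variance functions above except that the
// sum^2 / (w * h) term is not subtracted; the raw SSE is returned.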
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are placeholders for the PIC-enabled build.
// These declarations are for functions defined in subpel_variance.asm.
#define DECL(w, opt)                                                           \
  int vpx_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

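// FN builds a w x h sub-pixel variance function from the wf-wide column
// kernel declared above: blocks wider than wf are split into 16-pixel-wide
// columns whose partial sums and SSEs are accumulated before the final
// variance = sse - sum^2 / (w * h) step.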
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                        \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
    unsigned int sse;                                                          \
    int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
                                                  y_offset, dst, dst_stride,   \
                                                  h, &sse, NULL, NULL);        \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                          \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt)                                                        \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

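// Same column-splitting scheme as the plain sub-pixel variance above; the
// extra 'sec' pointer passes the second prediction buffer through to the
// averaging kernels in subpel_variance.asm.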
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(                    \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
      const uint8_t *sec) {                                                    \
    unsigned int sse;                                                          \
    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                         \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                      \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sseptr = sse;                                                             \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN