Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_config.h"
     12 
     13 #include "vp9/encoder/vp9_variance.h"
     14 #include "vp9/common/vp9_pragmas.h"
     15 #include "vpx_ports/mem.h"
     16 
     17 extern unsigned int vp9_get4x4var_mmx
     18 (
     19   const unsigned char *src_ptr,
     20   int  source_stride,
     21   const unsigned char *ref_ptr,
     22   int  recon_stride,
     23   unsigned int *SSE,
     24   int *Sum
     25 );
     26 
     27 unsigned int vp9_get_mb_ss_sse2
     28 (
     29   const short *src_ptr
     30 );
     31 unsigned int vp9_get16x16var_sse2
     32 (
     33   const unsigned char *src_ptr,
     34   int source_stride,
     35   const unsigned char *ref_ptr,
     36   int recon_stride,
     37   unsigned int *SSE,
     38   int *Sum
     39 );
     40 unsigned int vp9_get8x8var_sse2
     41 (
     42   const unsigned char *src_ptr,
     43   int source_stride,
     44   const unsigned char *ref_ptr,
     45   int recon_stride,
     46   unsigned int *SSE,
     47   int *Sum
     48 );
     49 void vp9_half_horiz_vert_variance8x_h_sse2
     50 (
     51   const unsigned char *ref_ptr,
     52   int ref_pixels_per_line,
     53   const unsigned char *src_ptr,
     54   int src_pixels_per_line,
     55   unsigned int Height,
     56   int *sum,
     57   unsigned int *sumsquared
     58 );
     59 void vp9_half_horiz_vert_variance16x_h_sse2
     60 (
     61   const unsigned char *ref_ptr,
     62   int ref_pixels_per_line,
     63   const unsigned char *src_ptr,
     64   int src_pixels_per_line,
     65   unsigned int Height,
     66   int *sum,
     67   unsigned int *sumsquared
     68 );
     69 void vp9_half_horiz_variance8x_h_sse2
     70 (
     71   const unsigned char *ref_ptr,
     72   int ref_pixels_per_line,
     73   const unsigned char *src_ptr,
     74   int src_pixels_per_line,
     75   unsigned int Height,
     76   int *sum,
     77   unsigned int *sumsquared
     78 );
     79 void vp9_half_horiz_variance16x_h_sse2
     80 (
     81   const unsigned char *ref_ptr,
     82   int ref_pixels_per_line,
     83   const unsigned char *src_ptr,
     84   int src_pixels_per_line,
     85   unsigned int Height,
     86   int *sum,
     87   unsigned int *sumsquared
     88 );
     89 void vp9_half_vert_variance8x_h_sse2
     90 (
     91   const unsigned char *ref_ptr,
     92   int ref_pixels_per_line,
     93   const unsigned char *src_ptr,
     94   int src_pixels_per_line,
     95   unsigned int Height,
     96   int *sum,
     97   unsigned int *sumsquared
     98 );
     99 void vp9_half_vert_variance16x_h_sse2
    100 (
    101   const unsigned char *ref_ptr,
    102   int ref_pixels_per_line,
    103   const unsigned char *src_ptr,
    104   int src_pixels_per_line,
    105   unsigned int Height,
    106   int *sum,
    107   unsigned int *sumsquared
    108 );
    109 
    110 typedef unsigned int (*get_var_sse2) (
    111   const unsigned char *src_ptr,
    112   int source_stride,
    113   const unsigned char *ref_ptr,
    114   int recon_stride,
    115   unsigned int *SSE,
    116   int *Sum
    117 );
    118 
    119 static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
    120                         const unsigned char *ref_ptr, int  recon_stride,
    121                         int  w, int  h, unsigned int *sse, int *sum,
    122                         get_var_sse2 var_fn, int block_size) {
    123   unsigned int sse0;
    124   int sum0;
    125   int i, j;
    126 
    127   *sse = 0;
    128   *sum = 0;
    129 
    130   for (i = 0; i < h; i += block_size) {
    131     for (j = 0; j < w; j += block_size) {
    132       var_fn(src_ptr + source_stride * i + j, source_stride,
    133              ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
    134       *sse += sse0;
    135       *sum += sum0;
    136     }
    137   }
    138 }
    139 
    140 unsigned int vp9_variance4x4_sse2(
    141   const unsigned char *src_ptr,
    142   int  source_stride,
    143   const unsigned char *ref_ptr,
    144   int  recon_stride,
    145   unsigned int *sse) {
    146   unsigned int var;
    147   int avg;
    148 
    149   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
    150                   &var, &avg, vp9_get4x4var_mmx, 4);
    151   *sse = var;
    152   return (var - (((unsigned int)avg * avg) >> 4));
    153 }
    154 
    155 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
    156                                   int  source_stride,
    157                                   const uint8_t *ref_ptr,
    158                                   int  recon_stride,
    159                                   unsigned int *sse) {
    160   unsigned int var;
    161   int avg;
    162 
    163   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
    164                   &var, &avg, vp9_get4x4var_mmx, 4);
    165   *sse = var;
    166   return (var - (((unsigned int)avg * avg) >> 5));
    167 }
    168 
    169 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
    170                                   int  source_stride,
    171                                   const uint8_t *ref_ptr,
    172                                   int  recon_stride,
    173                                   unsigned int *sse) {
    174   unsigned int var;
    175   int avg;
    176 
    177   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
    178                   &var, &avg, vp9_get4x4var_mmx, 4);
    179   *sse = var;
    180   return (var - (((unsigned int)avg * avg) >> 5));
    181 }
    182 
    183 unsigned int vp9_variance8x8_sse2
    184 (
    185   const unsigned char *src_ptr,
    186   int  source_stride,
    187   const unsigned char *ref_ptr,
    188   int  recon_stride,
    189   unsigned int *sse) {
    190   unsigned int var;
    191   int avg;
    192 
    193   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
    194                   &var, &avg, vp9_get8x8var_sse2, 8);
    195   *sse = var;
    196   return (var - (((unsigned int)avg * avg) >> 6));
    197 }
    198 
    199 unsigned int vp9_variance16x8_sse2
    200 (
    201   const unsigned char *src_ptr,
    202   int  source_stride,
    203   const unsigned char *ref_ptr,
    204   int  recon_stride,
    205   unsigned int *sse) {
    206   unsigned int var;
    207   int avg;
    208 
    209   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
    210                   &var, &avg, vp9_get8x8var_sse2, 8);
    211   *sse = var;
    212   return (var - (((unsigned int)avg * avg) >> 7));
    213 }
    214 
    215 unsigned int vp9_variance8x16_sse2
    216 (
    217   const unsigned char *src_ptr,
    218   int  source_stride,
    219   const unsigned char *ref_ptr,
    220   int  recon_stride,
    221   unsigned int *sse) {
    222   unsigned int var;
    223   int avg;
    224 
    225   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
    226                 &var, &avg, vp9_get8x8var_sse2, 8);
    227   *sse = var;
    228   return (var - (((unsigned int)avg * avg) >> 7));
    229 }
    230 
    231 unsigned int vp9_variance16x16_sse2
    232 (
    233   const unsigned char *src_ptr,
    234   int  source_stride,
    235   const unsigned char *ref_ptr,
    236   int  recon_stride,
    237   unsigned int *sse) {
    238   unsigned int var;
    239   int avg;
    240 
    241   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
    242                 &var, &avg, vp9_get16x16var_sse2, 16);
    243   *sse = var;
    244   return (var - (((unsigned int)avg * avg) >> 8));
    245 }
    246 
    247 unsigned int vp9_mse16x16_sse2(
    248   const unsigned char *src_ptr,
    249   int  source_stride,
    250   const unsigned char *ref_ptr,
    251   int  recon_stride,
    252   unsigned int *sse) {
    253 
    254   unsigned int sse0;
    255   int sum0;
    256   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
    257                        &sum0);
    258   *sse = sse0;
    259   return sse0;
    260 }
    261 
    262 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
    263                                     int  source_stride,
    264                                     const uint8_t *ref_ptr,
    265                                     int  recon_stride,
    266                                     unsigned int *sse) {
    267   unsigned int var;
    268   int avg;
    269 
    270   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
    271                 &var, &avg, vp9_get16x16var_sse2, 16);
    272   *sse = var;
    273   return (var - (((int64_t)avg * avg) >> 10));
    274 }
    275 
    276 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
    277                                     int  source_stride,
    278                                     const uint8_t *ref_ptr,
    279                                     int  recon_stride,
    280                                     unsigned int *sse) {
    281   unsigned int var;
    282   int avg;
    283 
    284   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
    285                 &var, &avg, vp9_get16x16var_sse2, 16);
    286   *sse = var;
    287   return (var - (((int64_t)avg * avg) >> 9));
    288 }
    289 
    290 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
    291                                     int  source_stride,
    292                                     const uint8_t *ref_ptr,
    293                                     int  recon_stride,
    294                                     unsigned int *sse) {
    295   unsigned int var;
    296   int avg;
    297 
    298   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
    299                 &var, &avg, vp9_get16x16var_sse2, 16);
    300   *sse = var;
    301   return (var - (((int64_t)avg * avg) >> 9));
    302 }
    303 
    304 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
    305                                     int  source_stride,
    306                                     const uint8_t *ref_ptr,
    307                                     int  recon_stride,
    308                                     unsigned int *sse) {
    309   unsigned int var;
    310   int avg;
    311 
    312   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
    313                 &var, &avg, vp9_get16x16var_sse2, 16);
    314   *sse = var;
    315   return (var - (((int64_t)avg * avg) >> 12));
    316 }
    317 
    318 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
    319                                     int  source_stride,
    320                                     const uint8_t *ref_ptr,
    321                                     int  recon_stride,
    322                                     unsigned int *sse) {
    323   unsigned int var;
    324   int avg;
    325 
    326   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
    327                 &var, &avg, vp9_get16x16var_sse2, 16);
    328   *sse = var;
    329   return (var - (((int64_t)avg * avg) >> 11));
    330 }
    331 
    332 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
    333                                     int  source_stride,
    334                                     const uint8_t *ref_ptr,
    335                                     int  recon_stride,
    336                                     unsigned int *sse) {
    337   unsigned int var;
    338   int avg;
    339 
    340   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
    341                 &var, &avg, vp9_get16x16var_sse2, 16);
    342   *sse = var;
    343   return (var - (((int64_t)avg * avg) >> 11));
    344 }
    345 
    346 #define DECL(w, opt) \
    347 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
    348                                         ptrdiff_t src_stride, \
    349                                         int x_offset, int y_offset, \
    350                                         const uint8_t *dst, \
    351                                         ptrdiff_t dst_stride, \
    352                                         int height, unsigned int *sse)
    353 #define DECLS(opt1, opt2) \
    354 DECL(4, opt2); \
    355 DECL(8, opt1); \
    356 DECL(16, opt1)
    357 
    358 DECLS(sse2, sse);
    359 DECLS(ssse3, ssse3);
    360 #undef DECLS
    361 #undef DECL
    362 
    363 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    364 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
    365                                                      int src_stride, \
    366                                                      int x_offset, \
    367                                                      int y_offset, \
    368                                                      const uint8_t *dst, \
    369                                                      int dst_stride, \
    370                                                      unsigned int *sse_ptr) { \
    371   unsigned int sse; \
    372   int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
    373                                                 y_offset, dst, dst_stride, \
    374                                                 h, &sse); \
    375   if (w > wf) { \
    376     unsigned int sse2; \
    377     int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
    378                                                    x_offset, y_offset, \
    379                                                    dst + 16, dst_stride, \
    380                                                    h, &sse2); \
    381     se += se2; \
    382     sse += sse2; \
    383     if (w > wf * 2) { \
    384       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
    385                                                  x_offset, y_offset, \
    386                                                  dst + 32, dst_stride, \
    387                                                  h, &sse2); \
    388       se += se2; \
    389       sse += sse2; \
    390       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
    391                                                  x_offset, y_offset, \
    392                                                  dst + 48, dst_stride, \
    393                                                  h, &sse2); \
    394       se += se2; \
    395       sse += sse2; \
    396     } \
    397   } \
    398   *sse_ptr = sse; \
    399   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    400 }
    401 
    402 #define FNS(opt1, opt2) \
    403 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    404 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    405 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    406 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    407 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    408 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    409 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    410 FN(16,  8, 16, 4, 3, opt1,); \
    411 FN(8,  16,  8, 3, 4, opt1,); \
    412 FN(8,   8,  8, 3, 3, opt1,); \
    413 FN(8,   4,  8, 3, 2, opt1,); \
    414 FN(4,   8,  4, 2, 3, opt2,); \
    415 FN(4,   4,  4, 2, 2, opt2,)
    416 
    417 FNS(sse2, sse);
    418 FNS(ssse3, ssse3);
    419 
    420 #undef FNS
    421 #undef FN
    422 
    423 #define DECL(w, opt) \
    424 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
    425                                             ptrdiff_t src_stride, \
    426                                             int x_offset, int y_offset, \
    427                                             const uint8_t *dst, \
    428                                             ptrdiff_t dst_stride, \
    429                                             const uint8_t *sec, \
    430                                             ptrdiff_t sec_stride, \
    431                                             int height, unsigned int *sse)
    432 #define DECLS(opt1, opt2) \
    433 DECL(4, opt2); \
    434 DECL(8, opt1); \
    435 DECL(16, opt1)
    436 
    437 DECLS(sse2, sse);
    438 DECLS(ssse3, ssse3);
    439 #undef DECL
    440 #undef DECLS
    441 
    442 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    443 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
    444                                                          int src_stride, \
    445                                                          int x_offset, \
    446                                                          int y_offset, \
    447                                                          const uint8_t *dst, \
    448                                                          int dst_stride, \
    449                                                          unsigned int *sseptr, \
    450                                                          const uint8_t *sec) { \
    451   unsigned int sse; \
    452   int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
    453                                                     y_offset, dst, dst_stride, \
    454                                                     sec, w, h, &sse); \
    455   if (w > wf) { \
    456     unsigned int sse2; \
    457     int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
    458                                                        x_offset, y_offset, \
    459                                                        dst + 16, dst_stride, \
    460                                                        sec + 16, w, h, &sse2); \
    461     se += se2; \
    462     sse += sse2; \
    463     if (w > wf * 2) { \
    464       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
    465                                                      x_offset, y_offset, \
    466                                                      dst + 32, dst_stride, \
    467                                                      sec + 32, w, h, &sse2); \
    468       se += se2; \
    469       sse += sse2; \
    470       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
    471                                                      x_offset, y_offset, \
    472                                                      dst + 48, dst_stride, \
    473                                                      sec + 48, w, h, &sse2); \
    474       se += se2; \
    475       sse += sse2; \
    476     } \
    477   } \
    478   *sseptr = sse; \
    479   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    480 }
    481 
    482 #define FNS(opt1, opt2) \
    483 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    484 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    485 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    486 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    487 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    488 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    489 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    490 FN(16,  8, 16, 4, 3, opt1,); \
    491 FN(8,  16,  8, 3, 4, opt1,); \
    492 FN(8,   8,  8, 3, 3, opt1,); \
    493 FN(8,   4,  8, 3, 2, opt1,); \
    494 FN(4,   8,  4, 2, 3, opt2,); \
    495 FN(4,   4,  4, 2, 2, opt2,)
    496 
    497 FNS(sse2, sse);
    498 FNS(ssse3, ssse3);
    499 
    500 #undef FNS
    501 #undef FN
    502 
    503 unsigned int vp9_variance_halfpixvar16x16_h_sse2(
    504   const unsigned char *src_ptr,
    505   int  src_pixels_per_line,
    506   const unsigned char *dst_ptr,
    507   int  dst_pixels_per_line,
    508   unsigned int *sse) {
    509   int xsum0;
    510   unsigned int xxsum0;
    511 
    512   vp9_half_horiz_variance16x_h_sse2(
    513     src_ptr, src_pixels_per_line,
    514     dst_ptr, dst_pixels_per_line, 16,
    515     &xsum0, &xxsum0);
    516 
    517   *sse = xxsum0;
    518   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    519 }
    520 
    521 
    522 unsigned int vp9_variance_halfpixvar16x16_v_sse2(
    523   const unsigned char *src_ptr,
    524   int  src_pixels_per_line,
    525   const unsigned char *dst_ptr,
    526   int  dst_pixels_per_line,
    527   unsigned int *sse) {
    528   int xsum0;
    529   unsigned int xxsum0;
    530   vp9_half_vert_variance16x_h_sse2(
    531     src_ptr, src_pixels_per_line,
    532     dst_ptr, dst_pixels_per_line, 16,
    533     &xsum0, &xxsum0);
    534 
    535   *sse = xxsum0;
    536   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    537 }
    538 
    539 
    540 unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
    541   const unsigned char *src_ptr,
    542   int  src_pixels_per_line,
    543   const unsigned char *dst_ptr,
    544   int  dst_pixels_per_line,
    545   unsigned int *sse) {
    546   int xsum0;
    547   unsigned int xxsum0;
    548 
    549   vp9_half_horiz_vert_variance16x_h_sse2(
    550     src_ptr, src_pixels_per_line,
    551     dst_ptr, dst_pixels_per_line, 16,
    552     &xsum0, &xxsum0);
    553 
    554   *sse = xxsum0;
    555   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    556 }
    557