Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_config.h"
     12 
     13 #include "vp9/encoder/vp9_variance.h"
     14 #include "vp9/common/vp9_pragmas.h"
     15 #include "vpx_ports/mem.h"
     16 
     17 extern unsigned int vp9_get4x4var_mmx
     18 (
     19   const unsigned char *src_ptr,
     20   int  source_stride,
     21   const unsigned char *ref_ptr,
     22   int  recon_stride,
     23   unsigned int *SSE,
     24   int *Sum
     25 );
     26 
     27 unsigned int vp9_get_mb_ss_sse2
     28 (
     29   const int16_t *src_ptr
     30 );
     31 unsigned int vp9_get16x16var_sse2
     32 (
     33   const unsigned char *src_ptr,
     34   int source_stride,
     35   const unsigned char *ref_ptr,
     36   int recon_stride,
     37   unsigned int *SSE,
     38   int *Sum
     39 );
     40 unsigned int vp9_get8x8var_sse2
     41 (
     42   const unsigned char *src_ptr,
     43   int source_stride,
     44   const unsigned char *ref_ptr,
     45   int recon_stride,
     46   unsigned int *SSE,
     47   int *Sum
     48 );
     49 void vp9_half_horiz_vert_variance8x_h_sse2
     50 (
     51   const unsigned char *ref_ptr,
     52   int ref_pixels_per_line,
     53   const unsigned char *src_ptr,
     54   int src_pixels_per_line,
     55   unsigned int Height,
     56   int *sum,
     57   unsigned int *sumsquared
     58 );
     59 void vp9_half_horiz_vert_variance16x_h_sse2
     60 (
     61   const unsigned char *ref_ptr,
     62   int ref_pixels_per_line,
     63   const unsigned char *src_ptr,
     64   int src_pixels_per_line,
     65   unsigned int Height,
     66   int *sum,
     67   unsigned int *sumsquared
     68 );
     69 void vp9_half_horiz_variance8x_h_sse2
     70 (
     71   const unsigned char *ref_ptr,
     72   int ref_pixels_per_line,
     73   const unsigned char *src_ptr,
     74   int src_pixels_per_line,
     75   unsigned int Height,
     76   int *sum,
     77   unsigned int *sumsquared
     78 );
     79 void vp9_half_horiz_variance16x_h_sse2
     80 (
     81   const unsigned char *ref_ptr,
     82   int ref_pixels_per_line,
     83   const unsigned char *src_ptr,
     84   int src_pixels_per_line,
     85   unsigned int Height,
     86   int *sum,
     87   unsigned int *sumsquared
     88 );
     89 void vp9_half_vert_variance8x_h_sse2
     90 (
     91   const unsigned char *ref_ptr,
     92   int ref_pixels_per_line,
     93   const unsigned char *src_ptr,
     94   int src_pixels_per_line,
     95   unsigned int Height,
     96   int *sum,
     97   unsigned int *sumsquared
     98 );
     99 void vp9_half_vert_variance16x_h_sse2
    100 (
    101   const unsigned char *ref_ptr,
    102   int ref_pixels_per_line,
    103   const unsigned char *src_ptr,
    104   int src_pixels_per_line,
    105   unsigned int Height,
    106   int *sum,
    107   unsigned int *sumsquared
    108 );
    109 
    110 typedef unsigned int (*get_var_sse2) (
    111   const unsigned char *src_ptr,
    112   int source_stride,
    113   const unsigned char *ref_ptr,
    114   int recon_stride,
    115   unsigned int *SSE,
    116   int *Sum
    117 );
    118 
    119 static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
    120                         const unsigned char *ref_ptr, int  recon_stride,
    121                         int  w, int  h, unsigned int *sse, int *sum,
    122                         get_var_sse2 var_fn, int block_size) {
    123   unsigned int sse0;
    124   int sum0;
    125   int i, j;
    126 
    127   *sse = 0;
    128   *sum = 0;
    129 
    130   for (i = 0; i < h; i += block_size) {
    131     for (j = 0; j < w; j += block_size) {
    132       var_fn(src_ptr + source_stride * i + j, source_stride,
    133              ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
    134       *sse += sse0;
    135       *sum += sum0;
    136     }
    137   }
    138 }
    139 
    140 unsigned int vp9_variance4x4_sse2(
    141   const unsigned char *src_ptr,
    142   int  source_stride,
    143   const unsigned char *ref_ptr,
    144   int  recon_stride,
    145   unsigned int *sse) {
    146   unsigned int var;
    147   int avg;
    148 
    149   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
    150                   &var, &avg, vp9_get4x4var_mmx, 4);
    151   *sse = var;
    152   return (var - (((unsigned int)avg * avg) >> 4));
    153 }
    154 
    155 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
    156                                   int  source_stride,
    157                                   const uint8_t *ref_ptr,
    158                                   int  recon_stride,
    159                                   unsigned int *sse) {
    160   unsigned int var;
    161   int avg;
    162 
    163   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
    164                   &var, &avg, vp9_get4x4var_mmx, 4);
    165   *sse = var;
    166   return (var - (((unsigned int)avg * avg) >> 5));
    167 }
    168 
    169 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
    170                                   int  source_stride,
    171                                   const uint8_t *ref_ptr,
    172                                   int  recon_stride,
    173                                   unsigned int *sse) {
    174   unsigned int var;
    175   int avg;
    176 
    177   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
    178                   &var, &avg, vp9_get4x4var_mmx, 4);
    179   *sse = var;
    180   return (var - (((unsigned int)avg * avg) >> 5));
    181 }
    182 
    183 unsigned int vp9_variance8x8_sse2
    184 (
    185   const unsigned char *src_ptr,
    186   int  source_stride,
    187   const unsigned char *ref_ptr,
    188   int  recon_stride,
    189   unsigned int *sse) {
    190   unsigned int var;
    191   int avg;
    192 
    193   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
    194                   &var, &avg, vp9_get8x8var_sse2, 8);
    195   *sse = var;
    196   return (var - (((unsigned int)avg * avg) >> 6));
    197 }
    198 
    199 unsigned int vp9_variance16x8_sse2
    200 (
    201   const unsigned char *src_ptr,
    202   int  source_stride,
    203   const unsigned char *ref_ptr,
    204   int  recon_stride,
    205   unsigned int *sse) {
    206   unsigned int var;
    207   int avg;
    208 
    209   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
    210                   &var, &avg, vp9_get8x8var_sse2, 8);
    211   *sse = var;
    212   return (var - (((unsigned int)avg * avg) >> 7));
    213 }
    214 
    215 unsigned int vp9_variance8x16_sse2
    216 (
    217   const unsigned char *src_ptr,
    218   int  source_stride,
    219   const unsigned char *ref_ptr,
    220   int  recon_stride,
    221   unsigned int *sse) {
    222   unsigned int var;
    223   int avg;
    224 
    225   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
    226                 &var, &avg, vp9_get8x8var_sse2, 8);
    227   *sse = var;
    228   return (var - (((unsigned int)avg * avg) >> 7));
    229 }
    230 
    231 unsigned int vp9_variance16x16_sse2
    232 (
    233   const unsigned char *src_ptr,
    234   int  source_stride,
    235   const unsigned char *ref_ptr,
    236   int  recon_stride,
    237   unsigned int *sse) {
    238   unsigned int var;
    239   int avg;
    240 
    241   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
    242                 &var, &avg, vp9_get16x16var_sse2, 16);
    243   *sse = var;
    244   return (var - (((unsigned int)avg * avg) >> 8));
    245 }
    246 
    247 unsigned int vp9_mse16x16_sse2(
    248   const unsigned char *src_ptr,
    249   int  source_stride,
    250   const unsigned char *ref_ptr,
    251   int  recon_stride,
    252   unsigned int *sse) {
    253   unsigned int sse0;
    254   int sum0;
    255   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
    256                        &sum0);
    257   *sse = sse0;
    258   return sse0;
    259 }
    260 
    261 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
    262                                     int  source_stride,
    263                                     const uint8_t *ref_ptr,
    264                                     int  recon_stride,
    265                                     unsigned int *sse) {
    266   unsigned int var;
    267   int avg;
    268 
    269   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
    270                 &var, &avg, vp9_get16x16var_sse2, 16);
    271   *sse = var;
    272   return (var - (((int64_t)avg * avg) >> 10));
    273 }
    274 
    275 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
    276                                     int  source_stride,
    277                                     const uint8_t *ref_ptr,
    278                                     int  recon_stride,
    279                                     unsigned int *sse) {
    280   unsigned int var;
    281   int avg;
    282 
    283   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
    284                 &var, &avg, vp9_get16x16var_sse2, 16);
    285   *sse = var;
    286   return (var - (((int64_t)avg * avg) >> 9));
    287 }
    288 
    289 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
    290                                     int  source_stride,
    291                                     const uint8_t *ref_ptr,
    292                                     int  recon_stride,
    293                                     unsigned int *sse) {
    294   unsigned int var;
    295   int avg;
    296 
    297   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
    298                 &var, &avg, vp9_get16x16var_sse2, 16);
    299   *sse = var;
    300   return (var - (((int64_t)avg * avg) >> 9));
    301 }
    302 
    303 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
    304                                     int  source_stride,
    305                                     const uint8_t *ref_ptr,
    306                                     int  recon_stride,
    307                                     unsigned int *sse) {
    308   unsigned int var;
    309   int avg;
    310 
    311   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
    312                 &var, &avg, vp9_get16x16var_sse2, 16);
    313   *sse = var;
    314   return (var - (((int64_t)avg * avg) >> 12));
    315 }
    316 
    317 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
    318                                     int  source_stride,
    319                                     const uint8_t *ref_ptr,
    320                                     int  recon_stride,
    321                                     unsigned int *sse) {
    322   unsigned int var;
    323   int avg;
    324 
    325   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
    326                 &var, &avg, vp9_get16x16var_sse2, 16);
    327   *sse = var;
    328   return (var - (((int64_t)avg * avg) >> 11));
    329 }
    330 
    331 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
    332                                     int  source_stride,
    333                                     const uint8_t *ref_ptr,
    334                                     int  recon_stride,
    335                                     unsigned int *sse) {
    336   unsigned int var;
    337   int avg;
    338 
    339   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
    340                 &var, &avg, vp9_get16x16var_sse2, 16);
    341   *sse = var;
    342   return (var - (((int64_t)avg * avg) >> 11));
    343 }
    344 
    345 #define DECL(w, opt) \
    346 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
    347                                         ptrdiff_t src_stride, \
    348                                         int x_offset, int y_offset, \
    349                                         const uint8_t *dst, \
    350                                         ptrdiff_t dst_stride, \
    351                                         int height, unsigned int *sse)
    352 #define DECLS(opt1, opt2) \
    353 DECL(4, opt2); \
    354 DECL(8, opt1); \
    355 DECL(16, opt1)
    356 
    357 DECLS(sse2, sse);
    358 DECLS(ssse3, ssse3);
    359 #undef DECLS
    360 #undef DECL
    361 
    362 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    363 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
    364                                                      int src_stride, \
    365                                                      int x_offset, \
    366                                                      int y_offset, \
    367                                                      const uint8_t *dst, \
    368                                                      int dst_stride, \
    369                                                      unsigned int *sse_ptr) { \
    370   unsigned int sse; \
    371   int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
    372                                                 y_offset, dst, dst_stride, \
    373                                                 h, &sse); \
    374   if (w > wf) { \
    375     unsigned int sse2; \
    376     int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
    377                                                    x_offset, y_offset, \
    378                                                    dst + 16, dst_stride, \
    379                                                    h, &sse2); \
    380     se += se2; \
    381     sse += sse2; \
    382     if (w > wf * 2) { \
    383       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
    384                                                  x_offset, y_offset, \
    385                                                  dst + 32, dst_stride, \
    386                                                  h, &sse2); \
    387       se += se2; \
    388       sse += sse2; \
    389       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
    390                                                  x_offset, y_offset, \
    391                                                  dst + 48, dst_stride, \
    392                                                  h, &sse2); \
    393       se += se2; \
    394       sse += sse2; \
    395     } \
    396   } \
    397   *sse_ptr = sse; \
    398   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    399 }
    400 
    401 #define FNS(opt1, opt2) \
    402 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    403 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    404 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    405 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    406 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    407 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    408 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    409 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
    410 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
    411 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
    412 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
    413 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
    414 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
    415 
    416 FNS(sse2, sse);
    417 FNS(ssse3, ssse3);
    418 
    419 #undef FNS
    420 #undef FN
    421 
    422 #define DECL(w, opt) \
    423 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
    424                                             ptrdiff_t src_stride, \
    425                                             int x_offset, int y_offset, \
    426                                             const uint8_t *dst, \
    427                                             ptrdiff_t dst_stride, \
    428                                             const uint8_t *sec, \
    429                                             ptrdiff_t sec_stride, \
    430                                             int height, unsigned int *sse)
    431 #define DECLS(opt1, opt2) \
    432 DECL(4, opt2); \
    433 DECL(8, opt1); \
    434 DECL(16, opt1)
    435 
    436 DECLS(sse2, sse);
    437 DECLS(ssse3, ssse3);
    438 #undef DECL
    439 #undef DECLS
    440 
    441 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    442 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
    443                                                          int src_stride, \
    444                                                          int x_offset, \
    445                                                          int y_offset, \
    446                                                          const uint8_t *dst, \
    447                                                          int dst_stride, \
    448                                                          unsigned int *sseptr, \
    449                                                          const uint8_t *sec) { \
    450   unsigned int sse; \
    451   int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
    452                                                     y_offset, dst, dst_stride, \
    453                                                     sec, w, h, &sse); \
    454   if (w > wf) { \
    455     unsigned int sse2; \
    456     int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
    457                                                        x_offset, y_offset, \
    458                                                        dst + 16, dst_stride, \
    459                                                        sec + 16, w, h, &sse2); \
    460     se += se2; \
    461     sse += sse2; \
    462     if (w > wf * 2) { \
    463       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
    464                                                      x_offset, y_offset, \
    465                                                      dst + 32, dst_stride, \
    466                                                      sec + 32, w, h, &sse2); \
    467       se += se2; \
    468       sse += sse2; \
    469       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
    470                                                      x_offset, y_offset, \
    471                                                      dst + 48, dst_stride, \
    472                                                      sec + 48, w, h, &sse2); \
    473       se += se2; \
    474       sse += sse2; \
    475     } \
    476   } \
    477   *sseptr = sse; \
    478   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    479 }
    480 
    481 #define FNS(opt1, opt2) \
    482 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    483 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    484 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    485 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    486 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    487 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    488 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    489 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
    490 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
    491 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
    492 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
    493 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
    494 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
    495 
    496 FNS(sse2, sse);
    497 FNS(ssse3, ssse3);
    498 
    499 #undef FNS
    500 #undef FN
    501 
    502 unsigned int vp9_variance_halfpixvar16x16_h_sse2(
    503   const unsigned char *src_ptr,
    504   int  src_pixels_per_line,
    505   const unsigned char *dst_ptr,
    506   int  dst_pixels_per_line,
    507   unsigned int *sse) {
    508   int xsum0;
    509   unsigned int xxsum0;
    510 
    511   vp9_half_horiz_variance16x_h_sse2(
    512     src_ptr, src_pixels_per_line,
    513     dst_ptr, dst_pixels_per_line, 16,
    514     &xsum0, &xxsum0);
    515 
    516   *sse = xxsum0;
    517   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    518 }
    519 
    520 
    521 unsigned int vp9_variance_halfpixvar16x16_v_sse2(
    522   const unsigned char *src_ptr,
    523   int  src_pixels_per_line,
    524   const unsigned char *dst_ptr,
    525   int  dst_pixels_per_line,
    526   unsigned int *sse) {
    527   int xsum0;
    528   unsigned int xxsum0;
    529   vp9_half_vert_variance16x_h_sse2(
    530     src_ptr, src_pixels_per_line,
    531     dst_ptr, dst_pixels_per_line, 16,
    532     &xsum0, &xxsum0);
    533 
    534   *sse = xxsum0;
    535   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    536 }
    537 
    538 
    539 unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
    540   const unsigned char *src_ptr,
    541   int  src_pixels_per_line,
    542   const unsigned char *dst_ptr,
    543   int  dst_pixels_per_line,
    544   unsigned int *sse) {
    545   int xsum0;
    546   unsigned int xxsum0;
    547 
    548   vp9_half_horiz_vert_variance16x_h_sse2(
    549     src_ptr, src_pixels_per_line,
    550     dst_ptr, dst_pixels_per_line, 16,
    551     &xsum0, &xxsum0);
    552 
    553   *sse = xxsum0;
    554   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    555 }
    556