Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_config.h"
     12 
     13 #include "vp9/encoder/vp9_variance.h"
     14 #include "vp9/common/vp9_pragmas.h"
     15 #include "vpx_ports/mem.h"
     16 
/*
 * Per-tile statistics kernel for a 4x4 tile. Only declared here; the
 * definition lives outside this file.
 * NOTE(review): the _mmx suffix suggests an MMX implementation -- confirm.
 *
 * In this file it is only passed to variance_sse2() as the tile callback.
 * That caller reads *SSE and *Sum after every call and ignores the return
 * value.
 */
extern unsigned int vp9_get4x4var_mmx
(
  const unsigned char *src_ptr,
  int  source_stride,
  const unsigned char *ref_ptr,
  int  recon_stride,
  unsigned int *SSE,
  int *Sum
);
     26 
/*
 * Per-tile statistics kernels for 16x16 and 8x8 tiles. Only declared here;
 * the definitions live outside this file.
 *
 * Both are passed to variance_sse2() as the tile callback, which reads *SSE
 * and *Sum after every call. vp9_get16x16var_sse2 is additionally called
 * directly by vp9_mse16x16_sse2(), which uses only the *SSE output.
 * Callers in this file never use the return value.
 */
unsigned int vp9_get16x16var_sse2
(
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
unsigned int vp9_get8x8var_sse2
(
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
/*
 * Half-pixel variance kernels. Only declared here; the definitions live
 * outside this file. All six share one signature: two (pointer, stride)
 * pairs, a row count (Height) and two outputs, *sum and *sumsquared.
 *
 * Only the three 16x variants are called in this file, by the
 * vp9_variance_halfpixvar16x16_{h,v,hv}_sse2() wrappers, always with
 * Height = 16.
 * NOTE(review): the names suggest horizontal, vertical and combined
 * half-pixel interpolation of one input before comparison -- confirm
 * against the kernel sources.
 */
void vp9_half_horiz_vert_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_vert_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_vert_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_vert_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
    105 
/*
 * Signature of a per-tile statistics kernel accepted by variance_sse2().
 * It matches the vp9_get4x4var_mmx, vp9_get8x8var_sse2 and
 * vp9_get16x16var_sse2 prototypes: source pointer and stride, reference
 * pointer and stride, and two outputs (*SSE, *Sum).
 */
typedef unsigned int (*get_var_sse2) (
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
    114 
    115 static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
    116                         const unsigned char *ref_ptr, int  recon_stride,
    117                         int  w, int  h, unsigned int *sse, int *sum,
    118                         get_var_sse2 var_fn, int block_size) {
    119   unsigned int sse0;
    120   int sum0;
    121   int i, j;
    122 
    123   *sse = 0;
    124   *sum = 0;
    125 
    126   for (i = 0; i < h; i += block_size) {
    127     for (j = 0; j < w; j += block_size) {
    128       var_fn(src_ptr + source_stride * i + j, source_stride,
    129              ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
    130       *sse += sse0;
    131       *sum += sum0;
    132     }
    133   }
    134 }
    135 
    136 unsigned int vp9_variance4x4_sse2(
    137   const unsigned char *src_ptr,
    138   int  source_stride,
    139   const unsigned char *ref_ptr,
    140   int  recon_stride,
    141   unsigned int *sse) {
    142   unsigned int var;
    143   int avg;
    144 
    145   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
    146                   &var, &avg, vp9_get4x4var_mmx, 4);
    147   *sse = var;
    148   return (var - (((unsigned int)avg * avg) >> 4));
    149 }
    150 
    151 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
    152                                   int  source_stride,
    153                                   const uint8_t *ref_ptr,
    154                                   int  recon_stride,
    155                                   unsigned int *sse) {
    156   unsigned int var;
    157   int avg;
    158 
    159   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
    160                   &var, &avg, vp9_get4x4var_mmx, 4);
    161   *sse = var;
    162   return (var - (((unsigned int)avg * avg) >> 5));
    163 }
    164 
    165 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
    166                                   int  source_stride,
    167                                   const uint8_t *ref_ptr,
    168                                   int  recon_stride,
    169                                   unsigned int *sse) {
    170   unsigned int var;
    171   int avg;
    172 
    173   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
    174                   &var, &avg, vp9_get4x4var_mmx, 4);
    175   *sse = var;
    176   return (var - (((unsigned int)avg * avg) >> 5));
    177 }
    178 
    179 unsigned int vp9_variance8x8_sse2
    180 (
    181   const unsigned char *src_ptr,
    182   int  source_stride,
    183   const unsigned char *ref_ptr,
    184   int  recon_stride,
    185   unsigned int *sse) {
    186   unsigned int var;
    187   int avg;
    188 
    189   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
    190                   &var, &avg, vp9_get8x8var_sse2, 8);
    191   *sse = var;
    192   return (var - (((unsigned int)avg * avg) >> 6));
    193 }
    194 
    195 unsigned int vp9_variance16x8_sse2
    196 (
    197   const unsigned char *src_ptr,
    198   int  source_stride,
    199   const unsigned char *ref_ptr,
    200   int  recon_stride,
    201   unsigned int *sse) {
    202   unsigned int var;
    203   int avg;
    204 
    205   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
    206                   &var, &avg, vp9_get8x8var_sse2, 8);
    207   *sse = var;
    208   return (var - (((unsigned int)avg * avg) >> 7));
    209 }
    210 
    211 unsigned int vp9_variance8x16_sse2
    212 (
    213   const unsigned char *src_ptr,
    214   int  source_stride,
    215   const unsigned char *ref_ptr,
    216   int  recon_stride,
    217   unsigned int *sse) {
    218   unsigned int var;
    219   int avg;
    220 
    221   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
    222                 &var, &avg, vp9_get8x8var_sse2, 8);
    223   *sse = var;
    224   return (var - (((unsigned int)avg * avg) >> 7));
    225 }
    226 
    227 unsigned int vp9_variance16x16_sse2
    228 (
    229   const unsigned char *src_ptr,
    230   int  source_stride,
    231   const unsigned char *ref_ptr,
    232   int  recon_stride,
    233   unsigned int *sse) {
    234   unsigned int var;
    235   int avg;
    236 
    237   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
    238                 &var, &avg, vp9_get16x16var_sse2, 16);
    239   *sse = var;
    240   return (var - (((unsigned int)avg * avg) >> 8));
    241 }
    242 
    243 unsigned int vp9_mse16x16_sse2(
    244   const unsigned char *src_ptr,
    245   int  source_stride,
    246   const unsigned char *ref_ptr,
    247   int  recon_stride,
    248   unsigned int *sse) {
    249   unsigned int sse0;
    250   int sum0;
    251   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
    252                        &sum0);
    253   *sse = sse0;
    254   return sse0;
    255 }
    256 
    257 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
    258                                     int  source_stride,
    259                                     const uint8_t *ref_ptr,
    260                                     int  recon_stride,
    261                                     unsigned int *sse) {
    262   unsigned int var;
    263   int avg;
    264 
    265   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
    266                 &var, &avg, vp9_get16x16var_sse2, 16);
    267   *sse = var;
    268   return (var - (((int64_t)avg * avg) >> 10));
    269 }
    270 
    271 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
    272                                     int  source_stride,
    273                                     const uint8_t *ref_ptr,
    274                                     int  recon_stride,
    275                                     unsigned int *sse) {
    276   unsigned int var;
    277   int avg;
    278 
    279   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
    280                 &var, &avg, vp9_get16x16var_sse2, 16);
    281   *sse = var;
    282   return (var - (((int64_t)avg * avg) >> 9));
    283 }
    284 
    285 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
    286                                     int  source_stride,
    287                                     const uint8_t *ref_ptr,
    288                                     int  recon_stride,
    289                                     unsigned int *sse) {
    290   unsigned int var;
    291   int avg;
    292 
    293   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
    294                 &var, &avg, vp9_get16x16var_sse2, 16);
    295   *sse = var;
    296   return (var - (((int64_t)avg * avg) >> 9));
    297 }
    298 
    299 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
    300                                     int  source_stride,
    301                                     const uint8_t *ref_ptr,
    302                                     int  recon_stride,
    303                                     unsigned int *sse) {
    304   unsigned int var;
    305   int avg;
    306 
    307   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
    308                 &var, &avg, vp9_get16x16var_sse2, 16);
    309   *sse = var;
    310   return (var - (((int64_t)avg * avg) >> 12));
    311 }
    312 
    313 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
    314                                     int  source_stride,
    315                                     const uint8_t *ref_ptr,
    316                                     int  recon_stride,
    317                                     unsigned int *sse) {
    318   unsigned int var;
    319   int avg;
    320 
    321   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
    322                 &var, &avg, vp9_get16x16var_sse2, 16);
    323   *sse = var;
    324   return (var - (((int64_t)avg * avg) >> 11));
    325 }
    326 
    327 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
    328                                     int  source_stride,
    329                                     const uint8_t *ref_ptr,
    330                                     int  recon_stride,
    331                                     unsigned int *sse) {
    332   unsigned int var;
    333   int avg;
    334 
    335   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
    336                 &var, &avg, vp9_get16x16var_sse2, 16);
    337   *sse = var;
    338   return (var - (((int64_t)avg * avg) >> 11));
    339 }
    340 
/*
 * DECL(w, opt) declares vp9_sub_pixel_variance<w>xh_<opt>, a sub-pixel
 * variance kernel for a strip of fixed width w and caller-supplied height.
 * The kernels are defined outside this file. The FN() wrappers in this file
 * add up the int they return and the value they store through sse.
 */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
/*
 * DECLS(opt1, opt2) declares the three strip widths for one instruction-set
 * flavour: the 4-wide kernel takes the opt2 suffix, the 8- and 16-wide
 * kernels take opt1.
 */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

/* Declares 4xh_sse, 8xh_sse2 and 16xh_sse2. */
DECLS(sse2, sse);
/* Declares 4xh_ssse3, 8xh_ssse3 and 16xh_ssse3. */
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
    357 
    358 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    359 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
    360                                                      int src_stride, \
    361                                                      int x_offset, \
    362                                                      int y_offset, \
    363                                                      const uint8_t *dst, \
    364                                                      int dst_stride, \
    365                                                      unsigned int *sse_ptr) { \
    366   unsigned int sse; \
    367   int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
    368                                                 y_offset, dst, dst_stride, \
    369                                                 h, &sse); \
    370   if (w > wf) { \
    371     unsigned int sse2; \
    372     int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
    373                                                    x_offset, y_offset, \
    374                                                    dst + 16, dst_stride, \
    375                                                    h, &sse2); \
    376     se += se2; \
    377     sse += sse2; \
    378     if (w > wf * 2) { \
    379       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
    380                                                  x_offset, y_offset, \
    381                                                  dst + 32, dst_stride, \
    382                                                  h, &sse2); \
    383       se += se2; \
    384       sse += sse2; \
    385       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
    386                                                  x_offset, y_offset, \
    387                                                  dst + 48, dst_stride, \
    388                                                  h, &sse2); \
    389       se += se2; \
    390       sse += sse2; \
    391     } \
    392   } \
    393   *sse_ptr = sse; \
    394   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    395 }
    396 
    397 #define FNS(opt1, opt2) \
    398 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    399 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    400 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    401 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    402 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    403 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    404 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    405 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
    406 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
    407 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
    408 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
    409 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
    410 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
    411 
    412 FNS(sse2, sse);
    413 FNS(ssse3, ssse3);
    414 
    415 #undef FNS
    416 #undef FN
    417 
/*
 * DECL(w, opt) declares vp9_sub_pixel_avg_variance<w>xh_<opt>, the strip
 * kernel used by the sub-pixel "avg" variance wrappers. Compared with the
 * plain sub-pixel kernels it takes an extra input, sec, with its own stride.
 * The kernels are defined outside this file.
 * NOTE(review): sec is presumably a second predictor that is averaged in
 * before the comparison -- confirm against the kernel sources.
 */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
/*
 * DECLS(opt1, opt2) declares the three strip widths for one instruction-set
 * flavour: the 4-wide kernel takes the opt2 suffix, the 8- and 16-wide
 * kernels take opt1.
 */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

/* Declares 4xh_sse, 8xh_sse2 and 16xh_sse2. */
DECLS(sse2, sse);
/* Declares 4xh_ssse3, 8xh_ssse3 and 16xh_ssse3. */
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
    436 
    437 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
    438 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
    439                                                          int src_stride, \
    440                                                          int x_offset, \
    441                                                          int y_offset, \
    442                                                          const uint8_t *dst, \
    443                                                          int dst_stride, \
    444                                                          unsigned int *sseptr, \
    445                                                          const uint8_t *sec) { \
    446   unsigned int sse; \
    447   int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
    448                                                     y_offset, dst, dst_stride, \
    449                                                     sec, w, h, &sse); \
    450   if (w > wf) { \
    451     unsigned int sse2; \
    452     int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
    453                                                        x_offset, y_offset, \
    454                                                        dst + 16, dst_stride, \
    455                                                        sec + 16, w, h, &sse2); \
    456     se += se2; \
    457     sse += sse2; \
    458     if (w > wf * 2) { \
    459       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
    460                                                      x_offset, y_offset, \
    461                                                      dst + 32, dst_stride, \
    462                                                      sec + 32, w, h, &sse2); \
    463       se += se2; \
    464       sse += sse2; \
    465       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
    466                                                      x_offset, y_offset, \
    467                                                      dst + 48, dst_stride, \
    468                                                      sec + 48, w, h, &sse2); \
    469       se += se2; \
    470       sse += sse2; \
    471     } \
    472   } \
    473   *sseptr = sse; \
    474   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
    475 }
    476 
    477 #define FNS(opt1, opt2) \
    478 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
    479 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
    480 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
    481 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
    482 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
    483 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
    484 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
    485 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
    486 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
    487 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
    488 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
    489 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
    490 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
    491 
    492 FNS(sse2, sse);
    493 FNS(ssse3, ssse3);
    494 
    495 #undef FNS
    496 #undef FN
    497 
    498 unsigned int vp9_variance_halfpixvar16x16_h_sse2(
    499   const unsigned char *src_ptr,
    500   int  src_pixels_per_line,
    501   const unsigned char *dst_ptr,
    502   int  dst_pixels_per_line,
    503   unsigned int *sse) {
    504   int xsum0;
    505   unsigned int xxsum0;
    506 
    507   vp9_half_horiz_variance16x_h_sse2(
    508     src_ptr, src_pixels_per_line,
    509     dst_ptr, dst_pixels_per_line, 16,
    510     &xsum0, &xxsum0);
    511 
    512   *sse = xxsum0;
    513   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    514 }
    515 
    516 
    517 unsigned int vp9_variance_halfpixvar16x16_v_sse2(
    518   const unsigned char *src_ptr,
    519   int  src_pixels_per_line,
    520   const unsigned char *dst_ptr,
    521   int  dst_pixels_per_line,
    522   unsigned int *sse) {
    523   int xsum0;
    524   unsigned int xxsum0;
    525   vp9_half_vert_variance16x_h_sse2(
    526     src_ptr, src_pixels_per_line,
    527     dst_ptr, dst_pixels_per_line, 16,
    528     &xsum0, &xxsum0);
    529 
    530   *sse = xxsum0;
    531   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    532 }
    533 
    534 
    535 unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
    536   const unsigned char *src_ptr,
    537   int  src_pixels_per_line,
    538   const unsigned char *dst_ptr,
    539   int  dst_pixels_per_line,
    540   unsigned int *sse) {
    541   int xsum0;
    542   unsigned int xxsum0;
    543 
    544   vp9_half_horiz_vert_variance16x_h_sse2(
    545     src_ptr, src_pixels_per_line,
    546     dst_ptr, dst_pixels_per_line, 16,
    547     &xsum0, &xxsum0);
    548 
    549   *sse = xxsum0;
    550   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    551 }
    552