/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
     10 
     11 
     12 #include "vp8/encoder/variance.h"
     13 #include "vp8/common/pragmas.h"
     14 #include "vpx_ports/mem.h"
     15 
     16 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     17 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     18 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     19 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     20 
     21 extern void vp8_filter_block2d_bil4x4_var_mmx
     22 (
     23     const unsigned char *ref_ptr,
     24     int ref_pixels_per_line,
     25     const unsigned char *src_ptr,
     26     int src_pixels_per_line,
     27     const short *HFilter,
     28     const short *VFilter,
     29     int *sum,
     30     unsigned int *sumsquared
     31 );
     32 
     33 extern unsigned int vp8_get4x4var_mmx
     34 (
     35     const unsigned char *src_ptr,
     36     int  source_stride,
     37     const unsigned char *ref_ptr,
     38     int  recon_stride,
     39     unsigned int *SSE,
     40     int *Sum
     41 );
     42 
     43 unsigned int vp8_get_mb_ss_sse2
     44 (
     45     const short *src_ptr
     46 );
     47 unsigned int vp8_get16x16var_sse2
     48 (
     49     const unsigned char *src_ptr,
     50     int source_stride,
     51     const unsigned char *ref_ptr,
     52     int recon_stride,
     53     unsigned int *SSE,
     54     int *Sum
     55 );
     56 unsigned int vp8_get16x16pred_error_sse2
     57 (
     58     const unsigned char *src_ptr,
     59     int src_stride,
     60     const unsigned char *ref_ptr,
     61     int ref_stride
     62 );
     63 unsigned int vp8_get8x8var_sse2
     64 (
     65     const unsigned char *src_ptr,
     66     int source_stride,
     67     const unsigned char *ref_ptr,
     68     int recon_stride,
     69     unsigned int *SSE,
     70     int *Sum
     71 );
     72 void vp8_filter_block2d_bil_var_sse2
     73 (
     74     const unsigned char *ref_ptr,
     75     int ref_pixels_per_line,
     76     const unsigned char *src_ptr,
     77     int src_pixels_per_line,
     78     unsigned int Height,
     79     int  xoffset,
     80     int  yoffset,
     81     int *sum,
     82     unsigned int *sumsquared
     83 );
     84 void vp8_half_horiz_vert_variance8x_h_sse2
     85 (
     86     const unsigned char *ref_ptr,
     87     int ref_pixels_per_line,
     88     const unsigned char *src_ptr,
     89     int src_pixels_per_line,
     90     unsigned int Height,
     91     int *sum,
     92     unsigned int *sumsquared
     93 );
     94 void vp8_half_horiz_vert_variance16x_h_sse2
     95 (
     96     const unsigned char *ref_ptr,
     97     int ref_pixels_per_line,
     98     const unsigned char *src_ptr,
     99     int src_pixels_per_line,
    100     unsigned int Height,
    101     int *sum,
    102     unsigned int *sumsquared
    103 );
    104 void vp8_half_horiz_variance8x_h_sse2
    105 (
    106     const unsigned char *ref_ptr,
    107     int ref_pixels_per_line,
    108     const unsigned char *src_ptr,
    109     int src_pixels_per_line,
    110     unsigned int Height,
    111     int *sum,
    112     unsigned int *sumsquared
    113 );
    114 void vp8_half_horiz_variance16x_h_sse2
    115 (
    116     const unsigned char *ref_ptr,
    117     int ref_pixels_per_line,
    118     const unsigned char *src_ptr,
    119     int src_pixels_per_line,
    120     unsigned int Height,
    121     int *sum,
    122     unsigned int *sumsquared
    123 );
    124 void vp8_half_vert_variance8x_h_sse2
    125 (
    126     const unsigned char *ref_ptr,
    127     int ref_pixels_per_line,
    128     const unsigned char *src_ptr,
    129     int src_pixels_per_line,
    130     unsigned int Height,
    131     int *sum,
    132     unsigned int *sumsquared
    133 );
    134 void vp8_half_vert_variance16x_h_sse2
    135 (
    136     const unsigned char *ref_ptr,
    137     int ref_pixels_per_line,
    138     const unsigned char *src_ptr,
    139     int src_pixels_per_line,
    140     unsigned int Height,
    141     int *sum,
    142     unsigned int *sumsquared
    143 );
    144 
    145 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
    146 
    147 unsigned int vp8_variance4x4_wmt(
    148     const unsigned char *src_ptr,
    149     int  source_stride,
    150     const unsigned char *ref_ptr,
    151     int  recon_stride)
    152 {
    153     unsigned int var;
    154     int avg;
    155 
    156     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    157     return (var - ((avg * avg) >> 4));
    158 
    159 }
    160 
    161 
    162 
    163 unsigned int vp8_variance8x8_wmt
    164 (
    165     const unsigned char *src_ptr,
    166     int  source_stride,
    167     const unsigned char *ref_ptr,
    168     int  recon_stride)
    169 {
    170     unsigned int var;
    171     int avg;
    172 
    173     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    174 
    175     return (var - ((avg * avg) >> 6));
    176 
    177 }
    178 
    179 
    180 unsigned int vp8_variance16x16_wmt
    181 (
    182     const unsigned char *src_ptr,
    183     int  source_stride,
    184     const unsigned char *ref_ptr,
    185     int  recon_stride,
    186     unsigned int *sse)
    187 {
    188     unsigned int sse0;
    189     int sum0;
    190 
    191 
    192     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    193     *sse = sse0;
    194     return (sse0 - ((sum0 * sum0) >> 8));
    195 }
    196 unsigned int vp8_mse16x16_wmt(
    197     const unsigned char *src_ptr,
    198     int  source_stride,
    199     const unsigned char *ref_ptr,
    200     int  recon_stride,
    201     unsigned int *sse)
    202 {
    203 
    204     unsigned int sse0;
    205     int sum0;
    206     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    207     *sse = sse0;
    208     return sse0;
    209 
    210 }
    211 
    212 
    213 unsigned int vp8_variance16x8_wmt
    214 (
    215     const unsigned char *src_ptr,
    216     int  source_stride,
    217     const unsigned char *ref_ptr,
    218     int  recon_stride,
    219     unsigned int *sse)
    220 {
    221     unsigned int sse0, sse1, var;
    222     int sum0, sum1, avg;
    223 
    224     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    225     vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    226 
    227     var = sse0 + sse1;
    228     avg = sum0 + sum1;
    229     *sse = var;
    230     return (var - ((avg * avg) >> 7));
    231 
    232 }
    233 
    234 unsigned int vp8_variance8x16_wmt
    235 (
    236     const unsigned char *src_ptr,
    237     int  source_stride,
    238     const unsigned char *ref_ptr,
    239     int  recon_stride,
    240     unsigned int *sse)
    241 {
    242     unsigned int sse0, sse1, var;
    243     int sum0, sum1, avg;
    244 
    245     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    246     vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
    247 
    248     var = sse0 + sse1;
    249     avg = sum0 + sum1;
    250     *sse = var;
    251     return (var - ((avg * avg) >> 7));
    252 
    253 }
    254 
    255 unsigned int vp8_sub_pixel_variance4x4_wmt
    256 (
    257     const unsigned char  *src_ptr,
    258     int  src_pixels_per_line,
    259     int  xoffset,
    260     int  yoffset,
    261     const unsigned char *dst_ptr,
    262     int dst_pixels_per_line,
    263     unsigned int *sse
    264 )
    265 {
    266     int xsum;
    267     unsigned int xxsum;
    268     vp8_filter_block2d_bil4x4_var_mmx(
    269         src_ptr, src_pixels_per_line,
    270         dst_ptr, dst_pixels_per_line,
    271         vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
    272         &xsum, &xxsum
    273     );
    274     *sse = xxsum;
    275     return (xxsum - ((xsum * xsum) >> 4));
    276 }
    277 
    278 
    279 unsigned int vp8_sub_pixel_variance8x8_wmt
    280 (
    281     const unsigned char  *src_ptr,
    282     int  src_pixels_per_line,
    283     int  xoffset,
    284     int  yoffset,
    285     const unsigned char *dst_ptr,
    286     int dst_pixels_per_line,
    287     unsigned int *sse
    288 )
    289 {
    290     int xsum;
    291     unsigned int xxsum;
    292 
    293     if (xoffset == 4 && yoffset == 0)
    294     {
    295         vp8_half_horiz_variance8x_h_sse2(
    296             src_ptr, src_pixels_per_line,
    297             dst_ptr, dst_pixels_per_line, 8,
    298             &xsum, &xxsum);
    299     }
    300     else if (xoffset == 0 && yoffset == 4)
    301     {
    302         vp8_half_vert_variance8x_h_sse2(
    303             src_ptr, src_pixels_per_line,
    304             dst_ptr, dst_pixels_per_line, 8,
    305             &xsum, &xxsum);
    306     }
    307     else if (xoffset == 4 && yoffset == 4)
    308     {
    309         vp8_half_horiz_vert_variance8x_h_sse2(
    310             src_ptr, src_pixels_per_line,
    311             dst_ptr, dst_pixels_per_line, 8,
    312             &xsum, &xxsum);
    313     }
    314     else
    315     {
    316         vp8_filter_block2d_bil_var_sse2(
    317             src_ptr, src_pixels_per_line,
    318             dst_ptr, dst_pixels_per_line, 8,
    319             xoffset, yoffset,
    320             &xsum, &xxsum);
    321     }
    322 
    323     *sse = xxsum;
    324     return (xxsum - ((xsum * xsum) >> 6));
    325 }
    326 
    327 unsigned int vp8_sub_pixel_variance16x16_wmt
    328 (
    329     const unsigned char  *src_ptr,
    330     int  src_pixels_per_line,
    331     int  xoffset,
    332     int  yoffset,
    333     const unsigned char *dst_ptr,
    334     int dst_pixels_per_line,
    335     unsigned int *sse
    336 )
    337 {
    338     int xsum0, xsum1;
    339     unsigned int xxsum0, xxsum1;
    340 
    341 
    342     // note we could avoid these if statements if the calling function
    343     // just called the appropriate functions inside.
    344     if (xoffset == 4 && yoffset == 0)
    345     {
    346         vp8_half_horiz_variance16x_h_sse2(
    347             src_ptr, src_pixels_per_line,
    348             dst_ptr, dst_pixels_per_line, 16,
    349             &xsum0, &xxsum0);
    350     }
    351     else if (xoffset == 0 && yoffset == 4)
    352     {
    353         vp8_half_vert_variance16x_h_sse2(
    354             src_ptr, src_pixels_per_line,
    355             dst_ptr, dst_pixels_per_line, 16,
    356             &xsum0, &xxsum0);
    357     }
    358     else if (xoffset == 4 && yoffset == 4)
    359     {
    360         vp8_half_horiz_vert_variance16x_h_sse2(
    361             src_ptr, src_pixels_per_line,
    362             dst_ptr, dst_pixels_per_line, 16,
    363             &xsum0, &xxsum0);
    364     }
    365     else
    366     {
    367         vp8_filter_block2d_bil_var_sse2(
    368             src_ptr, src_pixels_per_line,
    369             dst_ptr, dst_pixels_per_line, 16,
    370             xoffset, yoffset,
    371             &xsum0, &xxsum0
    372         );
    373 
    374         vp8_filter_block2d_bil_var_sse2(
    375             src_ptr + 8, src_pixels_per_line,
    376             dst_ptr + 8, dst_pixels_per_line, 16,
    377             xoffset, yoffset,
    378             &xsum1, &xxsum1
    379         );
    380         xsum0 += xsum1;
    381         xxsum0 += xxsum1;
    382     }
    383 
    384     *sse = xxsum0;
    385     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    386 }
    387 
    388 unsigned int vp8_sub_pixel_mse16x16_wmt(
    389     const unsigned char  *src_ptr,
    390     int  src_pixels_per_line,
    391     int  xoffset,
    392     int  yoffset,
    393     const unsigned char *dst_ptr,
    394     int dst_pixels_per_line,
    395     unsigned int *sse
    396 )
    397 {
    398     vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    399     return *sse;
    400 }
    401 
    402 unsigned int vp8_sub_pixel_variance16x8_wmt
    403 (
    404     const unsigned char  *src_ptr,
    405     int  src_pixels_per_line,
    406     int  xoffset,
    407     int  yoffset,
    408     const unsigned char *dst_ptr,
    409     int dst_pixels_per_line,
    410     unsigned int *sse
    411 
    412 )
    413 {
    414     int xsum0, xsum1;
    415     unsigned int xxsum0, xxsum1;
    416 
    417     if (xoffset == 4 && yoffset == 0)
    418     {
    419         vp8_half_horiz_variance16x_h_sse2(
    420             src_ptr, src_pixels_per_line,
    421             dst_ptr, dst_pixels_per_line, 8,
    422             &xsum0, &xxsum0);
    423     }
    424     else if (xoffset == 0 && yoffset == 4)
    425     {
    426         vp8_half_vert_variance16x_h_sse2(
    427             src_ptr, src_pixels_per_line,
    428             dst_ptr, dst_pixels_per_line, 8,
    429             &xsum0, &xxsum0);
    430     }
    431     else if (xoffset == 4 && yoffset == 4)
    432     {
    433         vp8_half_horiz_vert_variance16x_h_sse2(
    434             src_ptr, src_pixels_per_line,
    435             dst_ptr, dst_pixels_per_line, 8,
    436             &xsum0, &xxsum0);
    437     }
    438     else
    439     {
    440         vp8_filter_block2d_bil_var_sse2(
    441             src_ptr, src_pixels_per_line,
    442             dst_ptr, dst_pixels_per_line, 8,
    443             xoffset, yoffset,
    444             &xsum0, &xxsum0);
    445 
    446         vp8_filter_block2d_bil_var_sse2(
    447             src_ptr + 8, src_pixels_per_line,
    448             dst_ptr + 8, dst_pixels_per_line, 8,
    449             xoffset, yoffset,
    450             &xsum1, &xxsum1);
    451         xsum0 += xsum1;
    452         xxsum0 += xxsum1;
    453     }
    454 
    455     *sse = xxsum0;
    456     return (xxsum0 - ((xsum0 * xsum0) >> 7));
    457 }
    458 
    459 unsigned int vp8_sub_pixel_variance8x16_wmt
    460 (
    461     const unsigned char  *src_ptr,
    462     int  src_pixels_per_line,
    463     int  xoffset,
    464     int  yoffset,
    465     const unsigned char *dst_ptr,
    466     int dst_pixels_per_line,
    467     unsigned int *sse
    468 )
    469 {
    470     int xsum;
    471     unsigned int xxsum;
    472 
    473     if (xoffset == 4 && yoffset == 0)
    474     {
    475         vp8_half_horiz_variance8x_h_sse2(
    476             src_ptr, src_pixels_per_line,
    477             dst_ptr, dst_pixels_per_line, 16,
    478             &xsum, &xxsum);
    479     }
    480     else if (xoffset == 0 && yoffset == 4)
    481     {
    482         vp8_half_vert_variance8x_h_sse2(
    483             src_ptr, src_pixels_per_line,
    484             dst_ptr, dst_pixels_per_line, 16,
    485             &xsum, &xxsum);
    486     }
    487     else if (xoffset == 4 && yoffset == 4)
    488     {
    489         vp8_half_horiz_vert_variance8x_h_sse2(
    490             src_ptr, src_pixels_per_line,
    491             dst_ptr, dst_pixels_per_line, 16,
    492             &xsum, &xxsum);
    493     }
    494     else
    495     {
    496         vp8_filter_block2d_bil_var_sse2(
    497             src_ptr, src_pixels_per_line,
    498             dst_ptr, dst_pixels_per_line, 16,
    499             xoffset, yoffset,
    500             &xsum, &xxsum);
    501     }
    502 
    503     *sse = xxsum;
    504     return (xxsum - ((xsum * xsum) >> 7));
    505 }
    506 
    507 
    508 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    509     const unsigned char *src_ptr,
    510     int  src_pixels_per_line,
    511     const unsigned char *dst_ptr,
    512     int  dst_pixels_per_line,
    513     unsigned int *sse)
    514 {
    515     int xsum0;
    516     unsigned int xxsum0;
    517 
    518     vp8_half_horiz_variance16x_h_sse2(
    519         src_ptr, src_pixels_per_line,
    520         dst_ptr, dst_pixels_per_line, 16,
    521         &xsum0, &xxsum0);
    522 
    523     *sse = xxsum0;
    524     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    525 }
    526 
    527 
    528 unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    529     const unsigned char *src_ptr,
    530     int  src_pixels_per_line,
    531     const unsigned char *dst_ptr,
    532     int  dst_pixels_per_line,
    533     unsigned int *sse)
    534 {
    535     int xsum0;
    536     unsigned int xxsum0;
    537     vp8_half_vert_variance16x_h_sse2(
    538         src_ptr, src_pixels_per_line,
    539         dst_ptr, dst_pixels_per_line, 16,
    540         &xsum0, &xxsum0);
    541 
    542     *sse = xxsum0;
    543     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    544 }
    545 
    546 
    547 unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    548     const unsigned char *src_ptr,
    549     int  src_pixels_per_line,
    550     const unsigned char *dst_ptr,
    551     int  dst_pixels_per_line,
    552     unsigned int *sse)
    553 {
    554     int xsum0;
    555     unsigned int xxsum0;
    556 
    557     vp8_half_horiz_vert_variance16x_h_sse2(
    558         src_ptr, src_pixels_per_line,
    559         dst_ptr, dst_pixels_per_line, 16,
    560         &xsum0, &xxsum0);
    561 
    562     *sse = xxsum0;
    563     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    564 }
    565