Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_config.h"
     12 #include "vp8/common/variance.h"
     13 #include "vp8/common/pragmas.h"
     14 #include "vpx_ports/mem.h"
     15 #include "vp8/common/x86/filter_x86.h"
     16 
     17 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
     18 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
     19 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
     20 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
     21 
     22 extern void vp8_filter_block2d_bil4x4_var_mmx
     23 (
     24     const unsigned char *ref_ptr,
     25     int ref_pixels_per_line,
     26     const unsigned char *src_ptr,
     27     int src_pixels_per_line,
     28     const short *HFilter,
     29     const short *VFilter,
     30     int *sum,
     31     unsigned int *sumsquared
     32 );
     33 
     34 extern unsigned int vp8_get4x4var_mmx
     35 (
     36     const unsigned char *src_ptr,
     37     int  source_stride,
     38     const unsigned char *ref_ptr,
     39     int  recon_stride,
     40     unsigned int *SSE,
     41     int *Sum
     42 );
     43 
     44 unsigned int vp8_get_mb_ss_sse2
     45 (
     46     const short *src_ptr
     47 );
     48 unsigned int vp8_get16x16var_sse2
     49 (
     50     const unsigned char *src_ptr,
     51     int source_stride,
     52     const unsigned char *ref_ptr,
     53     int recon_stride,
     54     unsigned int *SSE,
     55     int *Sum
     56 );
     57 unsigned int vp8_get8x8var_sse2
     58 (
     59     const unsigned char *src_ptr,
     60     int source_stride,
     61     const unsigned char *ref_ptr,
     62     int recon_stride,
     63     unsigned int *SSE,
     64     int *Sum
     65 );
     66 void vp8_filter_block2d_bil_var_sse2
     67 (
     68     const unsigned char *ref_ptr,
     69     int ref_pixels_per_line,
     70     const unsigned char *src_ptr,
     71     int src_pixels_per_line,
     72     unsigned int Height,
     73     int  xoffset,
     74     int  yoffset,
     75     int *sum,
     76     unsigned int *sumsquared
     77 );
     78 void vp8_half_horiz_vert_variance8x_h_sse2
     79 (
     80     const unsigned char *ref_ptr,
     81     int ref_pixels_per_line,
     82     const unsigned char *src_ptr,
     83     int src_pixels_per_line,
     84     unsigned int Height,
     85     int *sum,
     86     unsigned int *sumsquared
     87 );
     88 void vp8_half_horiz_vert_variance16x_h_sse2
     89 (
     90     const unsigned char *ref_ptr,
     91     int ref_pixels_per_line,
     92     const unsigned char *src_ptr,
     93     int src_pixels_per_line,
     94     unsigned int Height,
     95     int *sum,
     96     unsigned int *sumsquared
     97 );
     98 void vp8_half_horiz_variance8x_h_sse2
     99 (
    100     const unsigned char *ref_ptr,
    101     int ref_pixels_per_line,
    102     const unsigned char *src_ptr,
    103     int src_pixels_per_line,
    104     unsigned int Height,
    105     int *sum,
    106     unsigned int *sumsquared
    107 );
    108 void vp8_half_horiz_variance16x_h_sse2
    109 (
    110     const unsigned char *ref_ptr,
    111     int ref_pixels_per_line,
    112     const unsigned char *src_ptr,
    113     int src_pixels_per_line,
    114     unsigned int Height,
    115     int *sum,
    116     unsigned int *sumsquared
    117 );
    118 void vp8_half_vert_variance8x_h_sse2
    119 (
    120     const unsigned char *ref_ptr,
    121     int ref_pixels_per_line,
    122     const unsigned char *src_ptr,
    123     int src_pixels_per_line,
    124     unsigned int Height,
    125     int *sum,
    126     unsigned int *sumsquared
    127 );
    128 void vp8_half_vert_variance16x_h_sse2
    129 (
    130     const unsigned char *ref_ptr,
    131     int ref_pixels_per_line,
    132     const unsigned char *src_ptr,
    133     int src_pixels_per_line,
    134     unsigned int Height,
    135     int *sum,
    136     unsigned int *sumsquared
    137 );
    138 
    139 unsigned int vp8_variance4x4_wmt(
    140     const unsigned char *src_ptr,
    141     int  source_stride,
    142     const unsigned char *ref_ptr,
    143     int  recon_stride,
    144     unsigned int *sse)
    145 {
    146     unsigned int var;
    147     int avg;
    148 
    149     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    150     *sse = var;
    151     return (var - (((unsigned int)avg * avg) >> 4));
    152 
    153 }
    154 
    155 unsigned int vp8_variance8x8_wmt
    156 (
    157     const unsigned char *src_ptr,
    158     int  source_stride,
    159     const unsigned char *ref_ptr,
    160     int  recon_stride,
    161     unsigned int *sse)
    162 {
    163     unsigned int var;
    164     int avg;
    165 
    166     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    167     *sse = var;
    168     return (var - (((unsigned int)avg * avg) >> 6));
    169 
    170 }
    171 
    172 
    173 unsigned int vp8_variance16x16_wmt
    174 (
    175     const unsigned char *src_ptr,
    176     int  source_stride,
    177     const unsigned char *ref_ptr,
    178     int  recon_stride,
    179     unsigned int *sse)
    180 {
    181     unsigned int sse0;
    182     int sum0;
    183 
    184 
    185     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    186     *sse = sse0;
    187     return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
    188 }
    189 unsigned int vp8_mse16x16_wmt(
    190     const unsigned char *src_ptr,
    191     int  source_stride,
    192     const unsigned char *ref_ptr,
    193     int  recon_stride,
    194     unsigned int *sse)
    195 {
    196 
    197     unsigned int sse0;
    198     int sum0;
    199     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    200     *sse = sse0;
    201     return sse0;
    202 
    203 }
    204 
    205 
    206 unsigned int vp8_variance16x8_wmt
    207 (
    208     const unsigned char *src_ptr,
    209     int  source_stride,
    210     const unsigned char *ref_ptr,
    211     int  recon_stride,
    212     unsigned int *sse)
    213 {
    214     unsigned int sse0, sse1, var;
    215     int sum0, sum1, avg;
    216 
    217     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    218     vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    219 
    220     var = sse0 + sse1;
    221     avg = sum0 + sum1;
    222     *sse = var;
    223     return (var - (((unsigned int)avg * avg) >> 7));
    224 
    225 }
    226 
    227 unsigned int vp8_variance8x16_wmt
    228 (
    229     const unsigned char *src_ptr,
    230     int  source_stride,
    231     const unsigned char *ref_ptr,
    232     int  recon_stride,
    233     unsigned int *sse)
    234 {
    235     unsigned int sse0, sse1, var;
    236     int sum0, sum1, avg;
    237 
    238     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    239     vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
    240 
    241     var = sse0 + sse1;
    242     avg = sum0 + sum1;
    243     *sse = var;
    244     return (var - (((unsigned int)avg * avg) >> 7));
    245 
    246 }
    247 
    248 unsigned int vp8_sub_pixel_variance4x4_wmt
    249 (
    250     const unsigned char  *src_ptr,
    251     int  src_pixels_per_line,
    252     int  xoffset,
    253     int  yoffset,
    254     const unsigned char *dst_ptr,
    255     int dst_pixels_per_line,
    256     unsigned int *sse
    257 )
    258 {
    259     int xsum;
    260     unsigned int xxsum;
    261     vp8_filter_block2d_bil4x4_var_mmx(
    262         src_ptr, src_pixels_per_line,
    263         dst_ptr, dst_pixels_per_line,
    264         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
    265         &xsum, &xxsum
    266     );
    267     *sse = xxsum;
    268     return (xxsum - (((unsigned int)xsum * xsum) >> 4));
    269 }
    270 
    271 
    272 unsigned int vp8_sub_pixel_variance8x8_wmt
    273 (
    274     const unsigned char  *src_ptr,
    275     int  src_pixels_per_line,
    276     int  xoffset,
    277     int  yoffset,
    278     const unsigned char *dst_ptr,
    279     int dst_pixels_per_line,
    280     unsigned int *sse
    281 )
    282 {
    283     int xsum;
    284     unsigned int xxsum;
    285 
    286     if (xoffset == 4 && yoffset == 0)
    287     {
    288         vp8_half_horiz_variance8x_h_sse2(
    289             src_ptr, src_pixels_per_line,
    290             dst_ptr, dst_pixels_per_line, 8,
    291             &xsum, &xxsum);
    292     }
    293     else if (xoffset == 0 && yoffset == 4)
    294     {
    295         vp8_half_vert_variance8x_h_sse2(
    296             src_ptr, src_pixels_per_line,
    297             dst_ptr, dst_pixels_per_line, 8,
    298             &xsum, &xxsum);
    299     }
    300     else if (xoffset == 4 && yoffset == 4)
    301     {
    302         vp8_half_horiz_vert_variance8x_h_sse2(
    303             src_ptr, src_pixels_per_line,
    304             dst_ptr, dst_pixels_per_line, 8,
    305             &xsum, &xxsum);
    306     }
    307     else
    308     {
    309         vp8_filter_block2d_bil_var_sse2(
    310             src_ptr, src_pixels_per_line,
    311             dst_ptr, dst_pixels_per_line, 8,
    312             xoffset, yoffset,
    313             &xsum, &xxsum);
    314     }
    315 
    316     *sse = xxsum;
    317     return (xxsum - (((unsigned int)xsum * xsum) >> 6));
    318 }
    319 
    320 unsigned int vp8_sub_pixel_variance16x16_wmt
    321 (
    322     const unsigned char  *src_ptr,
    323     int  src_pixels_per_line,
    324     int  xoffset,
    325     int  yoffset,
    326     const unsigned char *dst_ptr,
    327     int dst_pixels_per_line,
    328     unsigned int *sse
    329 )
    330 {
    331     int xsum0, xsum1;
    332     unsigned int xxsum0, xxsum1;
    333 
    334 
    335     /* note we could avoid these if statements if the calling function
    336      * just called the appropriate functions inside.
    337      */
    338     if (xoffset == 4 && yoffset == 0)
    339     {
    340         vp8_half_horiz_variance16x_h_sse2(
    341             src_ptr, src_pixels_per_line,
    342             dst_ptr, dst_pixels_per_line, 16,
    343             &xsum0, &xxsum0);
    344     }
    345     else if (xoffset == 0 && yoffset == 4)
    346     {
    347         vp8_half_vert_variance16x_h_sse2(
    348             src_ptr, src_pixels_per_line,
    349             dst_ptr, dst_pixels_per_line, 16,
    350             &xsum0, &xxsum0);
    351     }
    352     else if (xoffset == 4 && yoffset == 4)
    353     {
    354         vp8_half_horiz_vert_variance16x_h_sse2(
    355             src_ptr, src_pixels_per_line,
    356             dst_ptr, dst_pixels_per_line, 16,
    357             &xsum0, &xxsum0);
    358     }
    359     else
    360     {
    361         vp8_filter_block2d_bil_var_sse2(
    362             src_ptr, src_pixels_per_line,
    363             dst_ptr, dst_pixels_per_line, 16,
    364             xoffset, yoffset,
    365             &xsum0, &xxsum0
    366         );
    367 
    368         vp8_filter_block2d_bil_var_sse2(
    369             src_ptr + 8, src_pixels_per_line,
    370             dst_ptr + 8, dst_pixels_per_line, 16,
    371             xoffset, yoffset,
    372             &xsum1, &xxsum1
    373         );
    374         xsum0 += xsum1;
    375         xxsum0 += xxsum1;
    376     }
    377 
    378     *sse = xxsum0;
    379     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    380 }
    381 
    382 unsigned int vp8_sub_pixel_mse16x16_wmt(
    383     const unsigned char  *src_ptr,
    384     int  src_pixels_per_line,
    385     int  xoffset,
    386     int  yoffset,
    387     const unsigned char *dst_ptr,
    388     int dst_pixels_per_line,
    389     unsigned int *sse
    390 )
    391 {
    392     vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    393     return *sse;
    394 }
    395 
    396 unsigned int vp8_sub_pixel_variance16x8_wmt
    397 (
    398     const unsigned char  *src_ptr,
    399     int  src_pixels_per_line,
    400     int  xoffset,
    401     int  yoffset,
    402     const unsigned char *dst_ptr,
    403     int dst_pixels_per_line,
    404     unsigned int *sse
    405 
    406 )
    407 {
    408     int xsum0, xsum1;
    409     unsigned int xxsum0, xxsum1;
    410 
    411     if (xoffset == 4 && yoffset == 0)
    412     {
    413         vp8_half_horiz_variance16x_h_sse2(
    414             src_ptr, src_pixels_per_line,
    415             dst_ptr, dst_pixels_per_line, 8,
    416             &xsum0, &xxsum0);
    417     }
    418     else if (xoffset == 0 && yoffset == 4)
    419     {
    420         vp8_half_vert_variance16x_h_sse2(
    421             src_ptr, src_pixels_per_line,
    422             dst_ptr, dst_pixels_per_line, 8,
    423             &xsum0, &xxsum0);
    424     }
    425     else if (xoffset == 4 && yoffset == 4)
    426     {
    427         vp8_half_horiz_vert_variance16x_h_sse2(
    428             src_ptr, src_pixels_per_line,
    429             dst_ptr, dst_pixels_per_line, 8,
    430             &xsum0, &xxsum0);
    431     }
    432     else
    433     {
    434         vp8_filter_block2d_bil_var_sse2(
    435             src_ptr, src_pixels_per_line,
    436             dst_ptr, dst_pixels_per_line, 8,
    437             xoffset, yoffset,
    438             &xsum0, &xxsum0);
    439 
    440         vp8_filter_block2d_bil_var_sse2(
    441             src_ptr + 8, src_pixels_per_line,
    442             dst_ptr + 8, dst_pixels_per_line, 8,
    443             xoffset, yoffset,
    444             &xsum1, &xxsum1);
    445         xsum0 += xsum1;
    446         xxsum0 += xxsum1;
    447     }
    448 
    449     *sse = xxsum0;
    450     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
    451 }
    452 
    453 unsigned int vp8_sub_pixel_variance8x16_wmt
    454 (
    455     const unsigned char  *src_ptr,
    456     int  src_pixels_per_line,
    457     int  xoffset,
    458     int  yoffset,
    459     const unsigned char *dst_ptr,
    460     int dst_pixels_per_line,
    461     unsigned int *sse
    462 )
    463 {
    464     int xsum;
    465     unsigned int xxsum;
    466 
    467     if (xoffset == 4 && yoffset == 0)
    468     {
    469         vp8_half_horiz_variance8x_h_sse2(
    470             src_ptr, src_pixels_per_line,
    471             dst_ptr, dst_pixels_per_line, 16,
    472             &xsum, &xxsum);
    473     }
    474     else if (xoffset == 0 && yoffset == 4)
    475     {
    476         vp8_half_vert_variance8x_h_sse2(
    477             src_ptr, src_pixels_per_line,
    478             dst_ptr, dst_pixels_per_line, 16,
    479             &xsum, &xxsum);
    480     }
    481     else if (xoffset == 4 && yoffset == 4)
    482     {
    483         vp8_half_horiz_vert_variance8x_h_sse2(
    484             src_ptr, src_pixels_per_line,
    485             dst_ptr, dst_pixels_per_line, 16,
    486             &xsum, &xxsum);
    487     }
    488     else
    489     {
    490         vp8_filter_block2d_bil_var_sse2(
    491             src_ptr, src_pixels_per_line,
    492             dst_ptr, dst_pixels_per_line, 16,
    493             xoffset, yoffset,
    494             &xsum, &xxsum);
    495     }
    496 
    497     *sse = xxsum;
    498     return (xxsum - (((unsigned int)xsum * xsum) >> 7));
    499 }
    500 
    501 
    502 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    503     const unsigned char *src_ptr,
    504     int  src_pixels_per_line,
    505     const unsigned char *dst_ptr,
    506     int  dst_pixels_per_line,
    507     unsigned int *sse)
    508 {
    509     int xsum0;
    510     unsigned int xxsum0;
    511 
    512     vp8_half_horiz_variance16x_h_sse2(
    513         src_ptr, src_pixels_per_line,
    514         dst_ptr, dst_pixels_per_line, 16,
    515         &xsum0, &xxsum0);
    516 
    517     *sse = xxsum0;
    518     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    519 }
    520 
    521 
    522 unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    523     const unsigned char *src_ptr,
    524     int  src_pixels_per_line,
    525     const unsigned char *dst_ptr,
    526     int  dst_pixels_per_line,
    527     unsigned int *sse)
    528 {
    529     int xsum0;
    530     unsigned int xxsum0;
    531     vp8_half_vert_variance16x_h_sse2(
    532         src_ptr, src_pixels_per_line,
    533         dst_ptr, dst_pixels_per_line, 16,
    534         &xsum0, &xxsum0);
    535 
    536     *sse = xxsum0;
    537     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    538 }
    539 
    540 
    541 unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    542     const unsigned char *src_ptr,
    543     int  src_pixels_per_line,
    544     const unsigned char *dst_ptr,
    545     int  dst_pixels_per_line,
    546     unsigned int *sse)
    547 {
    548     int xsum0;
    549     unsigned int xxsum0;
    550 
    551     vp8_half_horiz_vert_variance16x_h_sse2(
    552         src_ptr, src_pixels_per_line,
    553         dst_ptr, dst_pixels_per_line, 16,
    554         &xsum0, &xxsum0);
    555 
    556     *sse = xxsum0;
    557     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
    558 }
    559