Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include "variance.h"
     13 #include "pragmas.h"
     14 #include "vpx_ports/mem.h"
     15 
     16 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     17 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     18 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     19 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
     20 
     21 extern void vp8_filter_block2d_bil4x4_var_mmx
     22 (
     23     const unsigned char *ref_ptr,
     24     int ref_pixels_per_line,
     25     const unsigned char *src_ptr,
     26     int src_pixels_per_line,
     27     const short *HFilter,
     28     const short *VFilter,
     29     int *sum,
     30     unsigned int *sumsquared
     31 );
     32 
     33 extern unsigned int vp8_get4x4var_mmx
     34 (
     35     const unsigned char *src_ptr,
     36     int  source_stride,
     37     const unsigned char *ref_ptr,
     38     int  recon_stride,
     39     unsigned int *SSE,
     40     int *Sum
     41 );
     42 
     43 unsigned int vp8_get_mb_ss_sse2
     44 (
     45     const short *src_ptr
     46 );
     47 unsigned int vp8_get16x16var_sse2
     48 (
     49     const unsigned char *src_ptr,
     50     int source_stride,
     51     const unsigned char *ref_ptr,
     52     int recon_stride,
     53     unsigned int *SSE,
     54     int *Sum
     55 );
     56 unsigned int vp8_get16x16pred_error_sse2
     57 (
     58     const unsigned char *src_ptr,
     59     int src_stride,
     60     const unsigned char *ref_ptr,
     61     int ref_stride
     62 );
     63 unsigned int vp8_get8x8var_sse2
     64 (
     65     const unsigned char *src_ptr,
     66     int source_stride,
     67     const unsigned char *ref_ptr,
     68     int recon_stride,
     69     unsigned int *SSE,
     70     int *Sum
     71 );
     72 void vp8_filter_block2d_bil_var_sse2
     73 (
     74     const unsigned char *ref_ptr,
     75     int ref_pixels_per_line,
     76     const unsigned char *src_ptr,
     77     int src_pixels_per_line,
     78     unsigned int Height,
     79     const short *HFilter,
     80     const short *VFilter,
     81     int *sum,
     82     unsigned int *sumsquared
     83 );
     84 void vp8_half_horiz_vert_variance16x_h_sse2
     85 (
     86     const unsigned char *ref_ptr,
     87     int ref_pixels_per_line,
     88     const unsigned char *src_ptr,
     89     int src_pixels_per_line,
     90     unsigned int Height,
     91     int *sum,
     92     unsigned int *sumsquared
     93 );
     94 void vp8_half_horiz_variance16x_h_sse2
     95 (
     96     const unsigned char *ref_ptr,
     97     int ref_pixels_per_line,
     98     const unsigned char *src_ptr,
     99     int src_pixels_per_line,
    100     unsigned int Height,
    101     int *sum,
    102     unsigned int *sumsquared
    103 );
    104 void vp8_half_vert_variance16x_h_sse2
    105 (
    106     const unsigned char *ref_ptr,
    107     int ref_pixels_per_line,
    108     const unsigned char *src_ptr,
    109     int src_pixels_per_line,
    110     unsigned int Height,
    111     int *sum,
    112     unsigned int *sumsquared
    113 );
    114 
    115 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
    116 
    117 unsigned int vp8_variance4x4_wmt(
    118     const unsigned char *src_ptr,
    119     int  source_stride,
    120     const unsigned char *ref_ptr,
    121     int  recon_stride)
    122 {
    123     unsigned int var;
    124     int avg;
    125 
    126     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    127     return (var - ((avg * avg) >> 4));
    128 
    129 }
    130 
    131 
    132 
    133 unsigned int vp8_variance8x8_wmt
    134 (
    135     const unsigned char *src_ptr,
    136     int  source_stride,
    137     const unsigned char *ref_ptr,
    138     int  recon_stride)
    139 {
    140     unsigned int var;
    141     int avg;
    142 
    143     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    144 
    145     return (var - ((avg * avg) >> 6));
    146 
    147 }
    148 
    149 
    150 unsigned int vp8_variance16x16_wmt
    151 (
    152     const unsigned char *src_ptr,
    153     int  source_stride,
    154     const unsigned char *ref_ptr,
    155     int  recon_stride,
    156     unsigned int *sse)
    157 {
    158     unsigned int sse0;
    159     int sum0;
    160 
    161 
    162     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    163     *sse = sse0;
    164     return (sse0 - ((sum0 * sum0) >> 8));
    165 }
    166 unsigned int vp8_mse16x16_wmt(
    167     const unsigned char *src_ptr,
    168     int  source_stride,
    169     const unsigned char *ref_ptr,
    170     int  recon_stride,
    171     unsigned int *sse)
    172 {
    173 
    174     unsigned int sse0;
    175     int sum0;
    176     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    177     *sse = sse0;
    178     return sse0;
    179 
    180 }
    181 
    182 
    183 unsigned int vp8_variance16x8_wmt
    184 (
    185     const unsigned char *src_ptr,
    186     int  source_stride,
    187     const unsigned char *ref_ptr,
    188     int  recon_stride,
    189     unsigned int *sse)
    190 {
    191     unsigned int sse0, sse1, var;
    192     int sum0, sum1, avg;
    193 
    194     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    195     vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    196 
    197     var = sse0 + sse1;
    198     avg = sum0 + sum1;
    199     *sse = var;
    200     return (var - ((avg * avg) >> 7));
    201 
    202 }
    203 
    204 unsigned int vp8_variance8x16_wmt
    205 (
    206     const unsigned char *src_ptr,
    207     int  source_stride,
    208     const unsigned char *ref_ptr,
    209     int  recon_stride,
    210     unsigned int *sse)
    211 {
    212     unsigned int sse0, sse1, var;
    213     int sum0, sum1, avg;
    214 
    215     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    216     vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
    217 
    218     var = sse0 + sse1;
    219     avg = sum0 + sum1;
    220     *sse = var;
    221     return (var - ((avg * avg) >> 7));
    222 
    223 }
    224 
    225 ///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// in one pass                                                           //
    228 ///////////////////////////////////////////////////////////////////////////
    229 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
    230 {
    231     { 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0 },
    232     { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
    233     {  96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
    234     {  80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
    235     {  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
    236     {  48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
    237     {  32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
    238     {  16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
    239 };
    240 unsigned int vp8_sub_pixel_variance4x4_wmt
    241 (
    242     const unsigned char  *src_ptr,
    243     int  src_pixels_per_line,
    244     int  xoffset,
    245     int  yoffset,
    246     const unsigned char *dst_ptr,
    247     int dst_pixels_per_line,
    248     unsigned int *sse
    249 )
    250 {
    251     int xsum;
    252     unsigned int xxsum;
    253     vp8_filter_block2d_bil4x4_var_mmx(
    254         src_ptr, src_pixels_per_line,
    255         dst_ptr, dst_pixels_per_line,
    256         vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
    257         &xsum, &xxsum
    258     );
    259     *sse = xxsum;
    260     return (xxsum - ((xsum * xsum) >> 4));
    261 }
    262 
    263 
    264 unsigned int vp8_sub_pixel_variance8x8_wmt
    265 (
    266     const unsigned char  *src_ptr,
    267     int  src_pixels_per_line,
    268     int  xoffset,
    269     int  yoffset,
    270     const unsigned char *dst_ptr,
    271     int dst_pixels_per_line,
    272     unsigned int *sse
    273 )
    274 {
    275 
    276     int xsum;
    277     unsigned int xxsum;
    278     vp8_filter_block2d_bil_var_sse2(
    279         src_ptr, src_pixels_per_line,
    280         dst_ptr, dst_pixels_per_line, 8,
    281         vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    282         &xsum, &xxsum
    283     );
    284 
    285     *sse = xxsum;
    286     return (xxsum - ((xsum * xsum) >> 6));
    287 }
    288 
    289 unsigned int vp8_sub_pixel_variance16x16_wmt
    290 (
    291     const unsigned char  *src_ptr,
    292     int  src_pixels_per_line,
    293     int  xoffset,
    294     int  yoffset,
    295     const unsigned char *dst_ptr,
    296     int dst_pixels_per_line,
    297     unsigned int *sse
    298 )
    299 {
    300     int xsum0, xsum1;
    301     unsigned int xxsum0, xxsum1;
    302 
    303 
    304     // note we could avoid these if statements if the calling function
    305     // just called the appropriate functions inside.
    306     if (xoffset == 4 && yoffset == 0)
    307     {
    308         vp8_half_horiz_variance16x_h_sse2(
    309             src_ptr, src_pixels_per_line,
    310             dst_ptr, dst_pixels_per_line, 16,
    311             &xsum0, &xxsum0);
    312 
    313         vp8_half_horiz_variance16x_h_sse2(
    314             src_ptr + 8, src_pixels_per_line,
    315             dst_ptr + 8, dst_pixels_per_line, 16,
    316             &xsum1, &xxsum1);
    317     }
    318     else if (xoffset == 0 && yoffset == 4)
    319     {
    320         vp8_half_vert_variance16x_h_sse2(
    321             src_ptr, src_pixels_per_line,
    322             dst_ptr, dst_pixels_per_line, 16,
    323             &xsum0, &xxsum0);
    324 
    325         vp8_half_vert_variance16x_h_sse2(
    326             src_ptr + 8, src_pixels_per_line,
    327             dst_ptr + 8, dst_pixels_per_line, 16,
    328             &xsum1, &xxsum1);
    329     }
    330     else if (xoffset == 4 && yoffset == 4)
    331     {
    332         vp8_half_horiz_vert_variance16x_h_sse2(
    333             src_ptr, src_pixels_per_line,
    334             dst_ptr, dst_pixels_per_line, 16,
    335             &xsum0, &xxsum0);
    336 
    337         vp8_half_horiz_vert_variance16x_h_sse2(
    338             src_ptr + 8, src_pixels_per_line,
    339             dst_ptr + 8, dst_pixels_per_line, 16,
    340             &xsum1, &xxsum1);
    341     }
    342     else
    343     {
    344         vp8_filter_block2d_bil_var_sse2(
    345             src_ptr, src_pixels_per_line,
    346             dst_ptr, dst_pixels_per_line, 16,
    347             vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    348             &xsum0, &xxsum0
    349         );
    350 
    351 
    352         vp8_filter_block2d_bil_var_sse2(
    353             src_ptr + 8, src_pixels_per_line,
    354             dst_ptr + 8, dst_pixels_per_line, 16,
    355             vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    356             &xsum1, &xxsum1
    357         );
    358     }
    359 
    360     xsum0 += xsum1;
    361     xxsum0 += xxsum1;
    362     *sse = xxsum0;
    363     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    364 }
    365 
    366 unsigned int vp8_sub_pixel_mse16x16_wmt(
    367     const unsigned char  *src_ptr,
    368     int  src_pixels_per_line,
    369     int  xoffset,
    370     int  yoffset,
    371     const unsigned char *dst_ptr,
    372     int dst_pixels_per_line,
    373     unsigned int *sse
    374 )
    375 {
    376     vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    377     return *sse;
    378 }
    379 
    380 unsigned int vp8_sub_pixel_variance16x8_wmt
    381 (
    382     const unsigned char  *src_ptr,
    383     int  src_pixels_per_line,
    384     int  xoffset,
    385     int  yoffset,
    386     const unsigned char *dst_ptr,
    387     int dst_pixels_per_line,
    388     unsigned int *sse
    389 
    390 )
    391 {
    392     int xsum0, xsum1;
    393     unsigned int xxsum0, xxsum1;
    394 
    395 
    396     vp8_filter_block2d_bil_var_sse2(
    397         src_ptr, src_pixels_per_line,
    398         dst_ptr, dst_pixels_per_line, 8,
    399         vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    400         &xsum0, &xxsum0
    401     );
    402 
    403 
    404     vp8_filter_block2d_bil_var_sse2(
    405         src_ptr + 8, src_pixels_per_line,
    406         dst_ptr + 8, dst_pixels_per_line, 8,
    407         vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    408         &xsum1, &xxsum1
    409     );
    410 
    411     xsum0 += xsum1;
    412     xxsum0 += xxsum1;
    413 
    414     *sse = xxsum0;
    415     return (xxsum0 - ((xsum0 * xsum0) >> 7));
    416 }
    417 
    418 unsigned int vp8_sub_pixel_variance8x16_wmt
    419 (
    420     const unsigned char  *src_ptr,
    421     int  src_pixels_per_line,
    422     int  xoffset,
    423     int  yoffset,
    424     const unsigned char *dst_ptr,
    425     int dst_pixels_per_line,
    426     unsigned int *sse
    427 )
    428 {
    429     int xsum;
    430     unsigned int xxsum;
    431     vp8_filter_block2d_bil_var_sse2(
    432         src_ptr, src_pixels_per_line,
    433         dst_ptr, dst_pixels_per_line, 16,
    434         vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
    435         &xsum, &xxsum
    436     );
    437 
    438     *sse = xxsum;
    439     return (xxsum - ((xsum * xsum) >> 7));
    440 }
    441 
    442 unsigned int vp8_i_variance16x16_wmt(
    443     const unsigned char *src_ptr,
    444     int  source_stride,
    445     const unsigned char *ref_ptr,
    446     int  recon_stride,
    447     unsigned int *sse)
    448 {
    449     unsigned int sse0, sse1, sse2, sse3, var;
    450     int sum0, sum1, sum2, sum3, avg;
    451 
    452 
    453     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    454     vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    455     vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
    456     vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
    457 
    458     var = sse0 + sse1 + sse2 + sse3;
    459     avg = sum0 + sum1 + sum2 + sum3;
    460 
    461     *sse = var;
    462     return (var - ((avg * avg) >> 8));
    463 
    464 }
    465 
    466 unsigned int vp8_i_variance8x16_wmt(
    467     const unsigned char *src_ptr,
    468     int  source_stride,
    469     const unsigned char *ref_ptr,
    470     int  recon_stride,
    471     unsigned int *sse)
    472 {
    473     unsigned int sse0, sse1, var;
    474     int sum0, sum1, avg;
    475     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    476     vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
    477 
    478     var = sse0 + sse1;
    479     avg = sum0 + sum1;
    480 
    481     *sse = var;
    482     return (var - ((avg * avg) >> 7));
    483 
    484 }
    485 
    486 
    487 unsigned int vp8_i_sub_pixel_variance16x16_wmt
    488 (
    489     const unsigned char  *src_ptr,
    490     int  src_pixels_per_line,
    491     int  xoffset,
    492     int  yoffset,
    493     const unsigned char *dst_ptr,
    494     int dst_pixels_per_line,
    495     unsigned int *sse
    496 )
    497 {
    498     return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
    499 }
    500 
    501 
    502 unsigned int vp8_i_sub_pixel_variance8x16_wmt
    503 (
    504     const unsigned char  *src_ptr,
    505     int  src_pixels_per_line,
    506     int  xoffset,
    507     int  yoffset,
    508     const unsigned char *dst_ptr,
    509     int dst_pixels_per_line,
    510     unsigned int *sse
    511 )
    512 {
    513 
    514     return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
    515 }
    516 
    517 
    518 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    519     const unsigned char *src_ptr,
    520     int  src_pixels_per_line,
    521     const unsigned char *dst_ptr,
    522     int  dst_pixels_per_line,
    523     unsigned int *sse)
    524 {
    525     int xsum0, xsum1;
    526     unsigned int xxsum0, xxsum1;
    527 
    528     vp8_half_horiz_variance16x_h_sse2(
    529         src_ptr, src_pixels_per_line,
    530         dst_ptr, dst_pixels_per_line, 16,
    531         &xsum0, &xxsum0);
    532 
    533     vp8_half_horiz_variance16x_h_sse2(
    534         src_ptr + 8, src_pixels_per_line,
    535         dst_ptr + 8, dst_pixels_per_line, 16,
    536         &xsum1, &xxsum1);
    537 
    538     xsum0 += xsum1;
    539     xxsum0 += xxsum1;
    540     *sse = xxsum0;
    541     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    542 }
    543 
    544 
    545 unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    546     const unsigned char *src_ptr,
    547     int  src_pixels_per_line,
    548     const unsigned char *dst_ptr,
    549     int  dst_pixels_per_line,
    550     unsigned int *sse)
    551 {
    552     int xsum0, xsum1;
    553     unsigned int xxsum0, xxsum1;
    554 
    555     vp8_half_vert_variance16x_h_sse2(
    556         src_ptr, src_pixels_per_line,
    557         dst_ptr, dst_pixels_per_line, 16,
    558         &xsum0, &xxsum0);
    559 
    560     vp8_half_vert_variance16x_h_sse2(
    561         src_ptr + 8, src_pixels_per_line,
    562         dst_ptr + 8, dst_pixels_per_line, 16,
    563         &xsum1, &xxsum1);
    564 
    565     xsum0 += xsum1;
    566     xxsum0 += xxsum1;
    567     *sse = xxsum0;
    568     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    569 }
    570 
    571 
    572 unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    573     const unsigned char *src_ptr,
    574     int  src_pixels_per_line,
    575     const unsigned char *dst_ptr,
    576     int  dst_pixels_per_line,
    577     unsigned int *sse)
    578 {
    579     int xsum0, xsum1;
    580     unsigned int xxsum0, xxsum1;
    581 
    582     vp8_half_horiz_vert_variance16x_h_sse2(
    583         src_ptr, src_pixels_per_line,
    584         dst_ptr, dst_pixels_per_line, 16,
    585         &xsum0, &xxsum0);
    586 
    587     vp8_half_horiz_vert_variance16x_h_sse2(
    588         src_ptr + 8, src_pixels_per_line,
    589         dst_ptr + 8, dst_pixels_per_line, 16,
    590         &xsum1, &xxsum1);
    591 
    592     xsum0 += xsum1;
    593     xxsum0 += xxsum1;
    594     *sse = xxsum0;
    595     return (xxsum0 - ((xsum0 * xsum0) >> 8));
    596 }
    597