Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include "vpx_ports/config.h"
     13 #include "vpx_ports/mem.h"
     14 #include "vp8/common/subpixel.h"
     15 
     16 extern const short vp8_six_tap_mmx[8][6*8];
     17 extern const short vp8_bilinear_filters_mmx[8][2*8];
     18 
     19 extern void vp8_filter_block1d_h6_mmx
     20 (
     21     unsigned char   *src_ptr,
     22     unsigned short  *output_ptr,
     23     unsigned int    src_pixels_per_line,
     24     unsigned int    pixel_step,
     25     unsigned int    output_height,
     26     unsigned int    output_width,
     27     const short      *vp8_filter
     28 );
     29 extern void vp8_filter_block1dc_v6_mmx
     30 (
     31     unsigned short *src_ptr,
     32     unsigned char  *output_ptr,
     33     int             output_pitch,
     34     unsigned int    pixels_per_line,
     35     unsigned int    pixel_step,
     36     unsigned int    output_height,
     37     unsigned int    output_width,
     38     const short    *vp8_filter
     39 );
     40 extern void vp8_filter_block1d8_h6_sse2
     41 (
     42     unsigned char  *src_ptr,
     43     unsigned short *output_ptr,
     44     unsigned int    src_pixels_per_line,
     45     unsigned int    pixel_step,
     46     unsigned int    output_height,
     47     unsigned int    output_width,
     48     const short    *vp8_filter
     49 );
     50 extern void vp8_filter_block1d16_h6_sse2
     51 (
     52     unsigned char  *src_ptr,
     53     unsigned short *output_ptr,
     54     unsigned int    src_pixels_per_line,
     55     unsigned int    pixel_step,
     56     unsigned int    output_height,
     57     unsigned int    output_width,
     58     const short    *vp8_filter
     59 );
     60 extern void vp8_filter_block1d8_v6_sse2
     61 (
     62     unsigned short *src_ptr,
     63     unsigned char *output_ptr,
     64     int dst_ptich,
     65     unsigned int pixels_per_line,
     66     unsigned int pixel_step,
     67     unsigned int output_height,
     68     unsigned int output_width,
     69     const short    *vp8_filter
     70 );
     71 extern void vp8_filter_block1d16_v6_sse2
     72 (
     73     unsigned short *src_ptr,
     74     unsigned char *output_ptr,
     75     int dst_ptich,
     76     unsigned int pixels_per_line,
     77     unsigned int pixel_step,
     78     unsigned int output_height,
     79     unsigned int output_width,
     80     const short    *vp8_filter
     81 );
     82 extern void vp8_unpack_block1d16_h6_sse2
     83 (
     84     unsigned char  *src_ptr,
     85     unsigned short *output_ptr,
     86     unsigned int    src_pixels_per_line,
     87     unsigned int    output_height,
     88     unsigned int    output_width
     89 );
     90 extern void vp8_filter_block1d8_h6_only_sse2
     91 (
     92     unsigned char  *src_ptr,
     93     unsigned int    src_pixels_per_line,
     94     unsigned char  *output_ptr,
     95     int dst_ptich,
     96     unsigned int    output_height,
     97     const short    *vp8_filter
     98 );
     99 extern void vp8_filter_block1d16_h6_only_sse2
    100 (
    101     unsigned char  *src_ptr,
    102     unsigned int    src_pixels_per_line,
    103     unsigned char  *output_ptr,
    104     int dst_ptich,
    105     unsigned int    output_height,
    106     const short    *vp8_filter
    107 );
    108 extern void vp8_filter_block1d8_v6_only_sse2
    109 (
    110     unsigned char *src_ptr,
    111     unsigned int   src_pixels_per_line,
    112     unsigned char *output_ptr,
    113     int dst_ptich,
    114     unsigned int   output_height,
    115     const short   *vp8_filter
    116 );
/*
 * Declaration of the 8x8 MMX bilinear predictor through the project's
 * prototype_subpixel_predict macro (macro definition not visible here;
 * presumably from vp8/common/subpixel.h - confirm).
 * vp8_bilinear_predict16x16_mmx calls it once per 8x8 quadrant.
 */
extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
    118 
    119 
    120 #if HAVE_MMX
    121 void vp8_sixtap_predict4x4_mmx
    122 (
    123     unsigned char  *src_ptr,
    124     int   src_pixels_per_line,
    125     int  xoffset,
    126     int  yoffset,
    127     unsigned char *dst_ptr,
    128     int dst_pitch
    129 )
    130 {
    131     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  /* Temp data bufffer used in filtering */
    132     const short *HFilter, *VFilter;
    133     HFilter = vp8_six_tap_mmx[xoffset];
    134     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
    135     VFilter = vp8_six_tap_mmx[yoffset];
    136     vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
    137 
    138 }
    139 
    140 
    141 void vp8_sixtap_predict16x16_mmx
    142 (
    143     unsigned char  *src_ptr,
    144     int   src_pixels_per_line,
    145     int  xoffset,
    146     int  yoffset,
    147     unsigned char *dst_ptr,
    148     int dst_pitch
    149 )
    150 {
    151 
    152     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data bufffer used in filtering */
    153 
    154     const short *HFilter, *VFilter;
    155 
    156 
    157     HFilter = vp8_six_tap_mmx[xoffset];
    158 
    159     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
    160     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
    161     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,  FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
    162     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
    163 
    164     VFilter = vp8_six_tap_mmx[yoffset];
    165     vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, 16, VFilter);
    166     vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
    167     vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
    168     vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
    169 
    170 }
    171 
    172 
    173 void vp8_sixtap_predict8x8_mmx
    174 (
    175     unsigned char  *src_ptr,
    176     int   src_pixels_per_line,
    177     int  xoffset,
    178     int  yoffset,
    179     unsigned char *dst_ptr,
    180     int dst_pitch
    181 )
    182 {
    183 
    184     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
    185 
    186     const short *HFilter, *VFilter;
    187 
    188     HFilter = vp8_six_tap_mmx[xoffset];
    189     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
    190     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
    191 
    192     VFilter = vp8_six_tap_mmx[yoffset];
    193     vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, 8, VFilter);
    194     vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
    195 
    196 }
    197 
    198 
    199 void vp8_sixtap_predict8x4_mmx
    200 (
    201     unsigned char  *src_ptr,
    202     int   src_pixels_per_line,
    203     int  xoffset,
    204     int  yoffset,
    205     unsigned char *dst_ptr,
    206     int dst_pitch
    207 )
    208 {
    209 
    210     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
    211 
    212     const short *HFilter, *VFilter;
    213 
    214     HFilter = vp8_six_tap_mmx[xoffset];
    215     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
    216     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
    217 
    218     VFilter = vp8_six_tap_mmx[yoffset];
    219     vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, 8, VFilter);
    220     vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
    221 
    222 }
    223 
    224 
    225 
    226 void vp8_bilinear_predict16x16_mmx
    227 (
    228     unsigned char  *src_ptr,
    229     int   src_pixels_per_line,
    230     int  xoffset,
    231     int  yoffset,
    232     unsigned char *dst_ptr,
    233     int dst_pitch
    234 )
    235 {
    236     vp8_bilinear_predict8x8_mmx(src_ptr,   src_pixels_per_line, xoffset, yoffset, dst_ptr,   dst_pitch);
    237     vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
    238     vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,   src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8,   dst_pitch);
    239     vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
    240 }
    241 #endif
    242 
    243 
    244 #if HAVE_SSE2
    245 void vp8_sixtap_predict16x16_sse2
    246 (
    247     unsigned char  *src_ptr,
    248     int   src_pixels_per_line,
    249     int  xoffset,
    250     int  yoffset,
    251     unsigned char *dst_ptr,
    252     int dst_pitch
    253 
    254 )
    255 {
    256     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);    /* Temp data bufffer used in filtering */
    257 
    258     const short *HFilter, *VFilter;
    259 
    260     if (xoffset)
    261     {
    262         if (yoffset)
    263         {
    264             HFilter = vp8_six_tap_mmx[xoffset];
    265             vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
    266             VFilter = vp8_six_tap_mmx[yoffset];
    267             vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
    268         }
    269         else
    270         {
    271             /* First-pass only */
    272             HFilter = vp8_six_tap_mmx[xoffset];
    273             vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
    274         }
    275     }
    276     else
    277     {
    278         /* Second-pass only */
    279         VFilter = vp8_six_tap_mmx[yoffset];
    280         vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 21, 32);
    281         vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
    282     }
    283 }
    284 
    285 
    286 void vp8_sixtap_predict8x8_sse2
    287 (
    288     unsigned char  *src_ptr,
    289     int   src_pixels_per_line,
    290     int  xoffset,
    291     int  yoffset,
    292     unsigned char *dst_ptr,
    293     int dst_pitch
    294 )
    295 {
    296     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
    297     const short *HFilter, *VFilter;
    298 
    299     if (xoffset)
    300     {
    301         if (yoffset)
    302         {
    303             HFilter = vp8_six_tap_mmx[xoffset];
    304             vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
    305             VFilter = vp8_six_tap_mmx[yoffset];
    306             vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
    307         }
    308         else
    309         {
    310             /* First-pass only */
    311             HFilter = vp8_six_tap_mmx[xoffset];
    312             vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
    313         }
    314     }
    315     else
    316     {
    317         /* Second-pass only */
    318         VFilter = vp8_six_tap_mmx[yoffset];
    319         vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
    320     }
    321 }
    322 
    323 
    324 void vp8_sixtap_predict8x4_sse2
    325 (
    326     unsigned char  *src_ptr,
    327     int   src_pixels_per_line,
    328     int  xoffset,
    329     int  yoffset,
    330     unsigned char *dst_ptr,
    331     int dst_pitch
    332 )
    333 {
    334     DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
    335     const short *HFilter, *VFilter;
    336 
    337     if (xoffset)
    338     {
    339         if (yoffset)
    340         {
    341             HFilter = vp8_six_tap_mmx[xoffset];
    342             vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
    343             VFilter = vp8_six_tap_mmx[yoffset];
    344             vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
    345         }
    346         else
    347         {
    348             /* First-pass only */
    349             HFilter = vp8_six_tap_mmx[xoffset];
    350             vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
    351         }
    352     }
    353     else
    354     {
    355         /* Second-pass only */
    356         VFilter = vp8_six_tap_mmx[yoffset];
    357         vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
    358     }
    359 }
    360 
    361 #endif
    362 
    363 #if HAVE_SSSE3
    364 
    365 extern void vp8_filter_block1d8_h6_ssse3
    366 (
    367     unsigned char  *src_ptr,
    368     unsigned int    src_pixels_per_line,
    369     unsigned char  *output_ptr,
    370     unsigned int    output_pitch,
    371     unsigned int    output_height,
    372     unsigned int    vp8_filter_index
    373 );
    374 
    375 extern void vp8_filter_block1d16_h6_ssse3
    376 (
    377     unsigned char  *src_ptr,
    378     unsigned int    src_pixels_per_line,
    379     unsigned char  *output_ptr,
    380     unsigned int    output_pitch,
    381     unsigned int    output_height,
    382     unsigned int    vp8_filter_index
    383 );
    384 
    385 extern void vp8_filter_block1d16_v6_ssse3
    386 (
    387     unsigned char *src_ptr,
    388     unsigned int   src_pitch,
    389     unsigned char *output_ptr,
    390     unsigned int   out_pitch,
    391     unsigned int   output_height,
    392     unsigned int   vp8_filter_index
    393 );
    394 
    395 extern void vp8_filter_block1d8_v6_ssse3
    396 (
    397     unsigned char *src_ptr,
    398     unsigned int   src_pitch,
    399     unsigned char *output_ptr,
    400     unsigned int   out_pitch,
    401     unsigned int   output_height,
    402     unsigned int   vp8_filter_index
    403 );
    404 
    405 extern void vp8_filter_block1d4_h6_ssse3
    406 (
    407     unsigned char  *src_ptr,
    408     unsigned int    src_pixels_per_line,
    409     unsigned char  *output_ptr,
    410     unsigned int    output_pitch,
    411     unsigned int    output_height,
    412     unsigned int    vp8_filter_index
    413 );
    414 
    415 extern void vp8_filter_block1d4_v6_ssse3
    416 (
    417     unsigned char *src_ptr,
    418     unsigned int   src_pitch,
    419     unsigned char *output_ptr,
    420     unsigned int   out_pitch,
    421     unsigned int   output_height,
    422     unsigned int   vp8_filter_index
    423 );
    424 
    425 void vp8_sixtap_predict16x16_ssse3
    426 (
    427     unsigned char  *src_ptr,
    428     int   src_pixels_per_line,
    429     int  xoffset,
    430     int  yoffset,
    431     unsigned char *dst_ptr,
    432     int dst_pitch
    433 
    434 )
    435 {
    436     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
    437 
    438     if (xoffset)
    439     {
    440         if (yoffset)
    441         {
    442             vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
    443             vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
    444         }
    445         else
    446         {
    447             /* First-pass only */
    448             vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
    449         }
    450     }
    451     else
    452     {
    453         /* Second-pass only */
    454         vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
    455     }
    456 }
    457 
    458 void vp8_sixtap_predict8x8_ssse3
    459 (
    460     unsigned char  *src_ptr,
    461     int   src_pixels_per_line,
    462     int  xoffset,
    463     int  yoffset,
    464     unsigned char *dst_ptr,
    465     int dst_pitch
    466 )
    467 {
    468     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
    469 
    470     if (xoffset)
    471     {
    472         if (yoffset)
    473         {
    474             vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
    475             vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
    476         }
    477         else
    478         {
    479             vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
    480         }
    481     }
    482     else
    483     {
    484         /* Second-pass only */
    485         vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
    486     }
    487 }
    488 
    489 
    490 void vp8_sixtap_predict8x4_ssse3
    491 (
    492     unsigned char  *src_ptr,
    493     int   src_pixels_per_line,
    494     int  xoffset,
    495     int  yoffset,
    496     unsigned char *dst_ptr,
    497     int dst_pitch
    498 )
    499 {
    500     DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
    501 
    502     if (xoffset)
    503     {
    504         if (yoffset)
    505         {
    506             vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
    507             vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
    508         }
    509         else
    510         {
    511             /* First-pass only */
    512             vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
    513         }
    514     }
    515     else
    516     {
    517         /* Second-pass only */
    518         vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
    519     }
    520 }
    521 
    522 void vp8_sixtap_predict4x4_ssse3
    523 (
    524     unsigned char  *src_ptr,
    525     int   src_pixels_per_line,
    526     int  xoffset,
    527     int  yoffset,
    528     unsigned char *dst_ptr,
    529     int dst_pitch
    530 )
    531 {
    532   DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
    533 
    534   if (xoffset)
    535   {
    536       if (yoffset)
    537       {
    538           vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
    539           vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
    540       }
    541       else
    542       {
    543           vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
    544       }
    545   }
    546   else
    547   {
    548       vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
    549   }
    550 
    551 }
    552 
    553 #endif
    554