Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_config.h"
     12 #include "vp8_rtcd.h"
     13 #include "vpx_ports/mem.h"
     14 #include "filter_x86.h"
     15 
/* Six-tap filter coefficient table, defined outside this file. The SSE2/MMX
 * predict functions in this file pick a row with vp8_six_tap_x86[xoffset] or
 * vp8_six_tap_x86[yoffset] and hand it to the kernels declared below.
 * NOTE(review): the 6 * 8 row length presumably holds each of the six taps
 * replicated eight times for SIMD use -- confirm against the definition. */
extern const short vp8_six_tap_x86[8][6 * 8];

/* Filter kernels defined outside this file (presumably in x86 assembly --
 * TODO confirm). In this file the "h6" routines are used as the first pass
 * (they write 16-bit intermediates into a scratch buffer) and the "v6"
 * routines as the second pass (they read that buffer and write 8-bit pixels
 * to the destination). */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);
/* NOTE(review): "dst_ptich" in the prototypes below is a misspelling of
 * dst_pitch. Prototype parameter names do not affect callers, so it is
 * harmless, but worth correcting. */
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
/* Used by the 16x16 SSE2 path in place of the horizontal filter when xoffset
 * is zero; it fills the same 16-bit scratch buffer from 8-bit source rows. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);
/* The "_only" variants are called when just one of the two passes is needed;
 * they take 8-bit source pixels and write 8-bit output directly, with no
 * intermediate buffer. */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_ptich,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_ptich,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_ptich,
                                             unsigned int output_height,
                                             const short *vp8_filter);
     77 
     78 #if HAVE_MMX
     79 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
     80                                int xoffset, int yoffset, unsigned char *dst_ptr,
     81                                int dst_pitch) {
     82   DECLARE_ALIGNED(16, unsigned short,
     83                   FData2[16 * 16]); /* Temp data bufffer used in filtering */
     84   const short *HFilter, *VFilter;
     85   HFilter = vp8_six_tap_x86[xoffset];
     86   vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
     87                             src_pixels_per_line, 1, 9, 8, HFilter);
     88   VFilter = vp8_six_tap_x86[yoffset];
     89   vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
     90                              VFilter);
     91 }
     92 #endif
     93 
     94 #if HAVE_SSE2
     95 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
     96                                   int src_pixels_per_line, int xoffset,
     97                                   int yoffset, unsigned char *dst_ptr,
     98                                   int dst_pitch
     99 
    100                                   ) {
    101   DECLARE_ALIGNED(16, unsigned short,
    102                   FData2[24 * 24]); /* Temp data bufffer used in filtering */
    103 
    104   const short *HFilter, *VFilter;
    105 
    106   if (xoffset) {
    107     if (yoffset) {
    108       HFilter = vp8_six_tap_x86[xoffset];
    109       vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    110                                    src_pixels_per_line, 1, 21, 32, HFilter);
    111       VFilter = vp8_six_tap_x86[yoffset];
    112       vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
    113                                    dst_pitch, VFilter);
    114     } else {
    115       /* First-pass only */
    116       HFilter = vp8_six_tap_x86[xoffset];
    117       vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    118                                         dst_pitch, 16, HFilter);
    119     }
    120   } else {
    121     /* Second-pass only */
    122     VFilter = vp8_six_tap_x86[yoffset];
    123     vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    124                                  src_pixels_per_line, 21, 32);
    125     vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
    126                                  dst_pitch, VFilter);
    127   }
    128 }
    129 
    130 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
    131                                 int xoffset, int yoffset,
    132                                 unsigned char *dst_ptr, int dst_pitch) {
    133   DECLARE_ALIGNED(16, unsigned short,
    134                   FData2[256]); /* Temp data bufffer used in filtering */
    135   const short *HFilter, *VFilter;
    136 
    137   if (xoffset) {
    138     if (yoffset) {
    139       HFilter = vp8_six_tap_x86[xoffset];
    140       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    141                                   src_pixels_per_line, 1, 13, 16, HFilter);
    142       VFilter = vp8_six_tap_x86[yoffset];
    143       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
    144                                   dst_pitch, VFilter);
    145     } else {
    146       /* First-pass only */
    147       HFilter = vp8_six_tap_x86[xoffset];
    148       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    149                                        dst_pitch, 8, HFilter);
    150     }
    151   } else {
    152     /* Second-pass only */
    153     VFilter = vp8_six_tap_x86[yoffset];
    154     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
    155                                      src_pixels_per_line, dst_ptr, dst_pitch, 8,
    156                                      VFilter);
    157   }
    158 }
    159 
    160 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
    161                                 int xoffset, int yoffset,
    162                                 unsigned char *dst_ptr, int dst_pitch) {
    163   DECLARE_ALIGNED(16, unsigned short,
    164                   FData2[256]); /* Temp data bufffer used in filtering */
    165   const short *HFilter, *VFilter;
    166 
    167   if (xoffset) {
    168     if (yoffset) {
    169       HFilter = vp8_six_tap_x86[xoffset];
    170       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    171                                   src_pixels_per_line, 1, 9, 16, HFilter);
    172       VFilter = vp8_six_tap_x86[yoffset];
    173       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
    174                                   dst_pitch, VFilter);
    175     } else {
    176       /* First-pass only */
    177       HFilter = vp8_six_tap_x86[xoffset];
    178       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    179                                        dst_pitch, 4, HFilter);
    180     }
    181   } else {
    182     /* Second-pass only */
    183     VFilter = vp8_six_tap_x86[yoffset];
    184     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
    185                                      src_pixels_per_line, dst_ptr, dst_pitch, 4,
    186                                      VFilter);
    187   }
    188 }
    189 
    190 #endif
    191 
    192 #if HAVE_SSSE3
    193 
/* SSSE3 filter kernels, defined outside this file (presumably in x86
 * assembly -- TODO confirm). Unlike the SSE2 kernels, these take a filter
 * index rather than a coefficient pointer; the predict functions below pass
 * xoffset or yoffset directly. Both input and output are 8-bit pixels, so the
 * same kernel serves as a first pass into a scratch buffer or as a single
 * pass straight into the destination. */
extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);
    235 
    236 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
    237                                    int src_pixels_per_line, int xoffset,
    238                                    int yoffset, unsigned char *dst_ptr,
    239                                    int dst_pitch
    240 
    241                                    ) {
    242   DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
    243 
    244   if (xoffset) {
    245     if (yoffset) {
    246       vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    247                                     src_pixels_per_line, FData2, 16, 21,
    248                                     xoffset);
    249       vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
    250                                     yoffset);
    251     } else {
    252       /* First-pass only */
    253       vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    254                                     dst_pitch, 16, xoffset);
    255     }
    256   } else {
    257     if (yoffset) {
    258       /* Second-pass only */
    259       vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    260                                     src_pixels_per_line, dst_ptr, dst_pitch, 16,
    261                                     yoffset);
    262     } else {
    263       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    264        * yoffset==0) case correctly. Add copy function here to guarantee
    265        * six-tap function handles all possible offsets. */
    266       vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    267     }
    268   }
    269 }
    270 
    271 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
    272                                  int src_pixels_per_line, int xoffset,
    273                                  int yoffset, unsigned char *dst_ptr,
    274                                  int dst_pitch) {
    275   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
    276 
    277   if (xoffset) {
    278     if (yoffset) {
    279       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    280                                    src_pixels_per_line, FData2, 8, 13, xoffset);
    281       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
    282     } else {
    283       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    284                                    dst_pitch, 8, xoffset);
    285     }
    286   } else {
    287     if (yoffset) {
    288       /* Second-pass only */
    289       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    290                                    src_pixels_per_line, dst_ptr, dst_pitch, 8,
    291                                    yoffset);
    292     } else {
    293       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    294        * yoffset==0) case correctly. Add copy function here to guarantee
    295        * six-tap function handles all possible offsets. */
    296       vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    297     }
    298   }
    299 }
    300 
    301 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
    302                                  int src_pixels_per_line, int xoffset,
    303                                  int yoffset, unsigned char *dst_ptr,
    304                                  int dst_pitch) {
    305   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
    306 
    307   if (xoffset) {
    308     if (yoffset) {
    309       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    310                                    src_pixels_per_line, FData2, 8, 9, xoffset);
    311       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
    312     } else {
    313       /* First-pass only */
    314       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    315                                    dst_pitch, 4, xoffset);
    316     }
    317   } else {
    318     if (yoffset) {
    319       /* Second-pass only */
    320       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    321                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
    322                                    yoffset);
    323     } else {
    324       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    325        * yoffset==0) case correctly. Add copy function here to guarantee
    326        * six-tap function handles all possible offsets. */
    327       vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    328     }
    329   }
    330 }
    331 
    332 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
    333                                  int src_pixels_per_line, int xoffset,
    334                                  int yoffset, unsigned char *dst_ptr,
    335                                  int dst_pitch) {
    336   DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
    337 
    338   if (xoffset) {
    339     if (yoffset) {
    340       vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    341                                    src_pixels_per_line, FData2, 4, 9, xoffset);
    342       vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
    343     } else {
    344       vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    345                                    dst_pitch, 4, xoffset);
    346     }
    347   } else {
    348     if (yoffset) {
    349       vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    350                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
    351                                    yoffset);
    352     } else {
    353       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    354         * yoffset==0) case correctly. Add copy function here to guarantee
    355         * six-tap function handles all possible offsets. */
    356       int r;
    357 
    358       for (r = 0; r < 4; ++r) {
    359         dst_ptr[0] = src_ptr[0];
    360         dst_ptr[1] = src_ptr[1];
    361         dst_ptr[2] = src_ptr[2];
    362         dst_ptr[3] = src_ptr[3];
    363         dst_ptr += dst_pitch;
    364         src_ptr += src_pixels_per_line;
    365       }
    366     }
    367   }
    368 }
    369 
    370 #endif
    371