Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_config.h"
     12 #include "vp8_rtcd.h"
     13 #include "vpx_ports/mem.h"
     14 
     15 extern const short vp8_six_tap_x86[8][6 * 8];
     16 
     17 extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
     18                                       unsigned short *output_ptr,
     19                                       unsigned int src_pixels_per_line,
     20                                       unsigned int pixel_step,
     21                                       unsigned int output_height,
     22                                       unsigned int output_width,
     23                                       const short *vp8_filter);
     24 extern void vp8_filter_block1dc_v6_mmx(
     25     unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
     26     unsigned int pixels_per_line, unsigned int pixel_step,
     27     unsigned int output_height, unsigned int output_width,
     28     const short *vp8_filter);
     29 extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
     30                                         unsigned short *output_ptr,
     31                                         unsigned int src_pixels_per_line,
     32                                         unsigned int pixel_step,
     33                                         unsigned int output_height,
     34                                         unsigned int output_width,
     35                                         const short *vp8_filter);
     36 extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
     37                                          unsigned short *output_ptr,
     38                                          unsigned int src_pixels_per_line,
     39                                          unsigned int pixel_step,
     40                                          unsigned int output_height,
     41                                          unsigned int output_width,
     42                                          const short *vp8_filter);
     43 extern void vp8_filter_block1d8_v6_sse2(
     44     unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
     45     unsigned int pixels_per_line, unsigned int pixel_step,
     46     unsigned int output_height, unsigned int output_width,
     47     const short *vp8_filter);
     48 extern void vp8_filter_block1d16_v6_sse2(
     49     unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
     50     unsigned int pixels_per_line, unsigned int pixel_step,
     51     unsigned int output_height, unsigned int output_width,
     52     const short *vp8_filter);
     53 extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
     54                                          unsigned short *output_ptr,
     55                                          unsigned int src_pixels_per_line,
     56                                          unsigned int output_height,
     57                                          unsigned int output_width);
     58 extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
     59                                              unsigned int src_pixels_per_line,
     60                                              unsigned char *output_ptr,
     61                                              int dst_ptich,
     62                                              unsigned int output_height,
     63                                              const short *vp8_filter);
     64 extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
     65                                               unsigned int src_pixels_per_line,
     66                                               unsigned char *output_ptr,
     67                                               int dst_ptich,
     68                                               unsigned int output_height,
     69                                               const short *vp8_filter);
     70 extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
     71                                              unsigned int src_pixels_per_line,
     72                                              unsigned char *output_ptr,
     73                                              int dst_ptich,
     74                                              unsigned int output_height,
     75                                              const short *vp8_filter);
     76 
     77 #if HAVE_MMX
     78 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
     79                                int xoffset, int yoffset, unsigned char *dst_ptr,
     80                                int dst_pitch) {
     81   DECLARE_ALIGNED(16, unsigned short,
     82                   FData2[16 * 16]); /* Temp data bufffer used in filtering */
     83   const short *HFilter, *VFilter;
     84   HFilter = vp8_six_tap_x86[xoffset];
     85   vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
     86                             src_pixels_per_line, 1, 9, 8, HFilter);
     87   VFilter = vp8_six_tap_x86[yoffset];
     88   vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
     89                              VFilter);
     90 }
     91 #endif
     92 
     93 #if HAVE_SSE2
     94 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
     95                                   int src_pixels_per_line, int xoffset,
     96                                   int yoffset, unsigned char *dst_ptr,
     97                                   int dst_pitch) {
     98   DECLARE_ALIGNED(16, unsigned short,
     99                   FData2[24 * 24]); /* Temp data bufffer used in filtering */
    100 
    101   const short *HFilter, *VFilter;
    102 
    103   if (xoffset) {
    104     if (yoffset) {
    105       HFilter = vp8_six_tap_x86[xoffset];
    106       vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    107                                    src_pixels_per_line, 1, 21, 32, HFilter);
    108       VFilter = vp8_six_tap_x86[yoffset];
    109       vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
    110                                    dst_pitch, VFilter);
    111     } else {
    112       /* First-pass only */
    113       HFilter = vp8_six_tap_x86[xoffset];
    114       vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    115                                         dst_pitch, 16, HFilter);
    116     }
    117   } else {
    118     /* Second-pass only */
    119     VFilter = vp8_six_tap_x86[yoffset];
    120     vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    121                                  src_pixels_per_line, 21, 32);
    122     vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
    123                                  dst_pitch, VFilter);
    124   }
    125 }
    126 
    127 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
    128                                 int xoffset, int yoffset,
    129                                 unsigned char *dst_ptr, int dst_pitch) {
    130   DECLARE_ALIGNED(16, unsigned short,
    131                   FData2[256]); /* Temp data bufffer used in filtering */
    132   const short *HFilter, *VFilter;
    133 
    134   if (xoffset) {
    135     if (yoffset) {
    136       HFilter = vp8_six_tap_x86[xoffset];
    137       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    138                                   src_pixels_per_line, 1, 13, 16, HFilter);
    139       VFilter = vp8_six_tap_x86[yoffset];
    140       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
    141                                   dst_pitch, VFilter);
    142     } else {
    143       /* First-pass only */
    144       HFilter = vp8_six_tap_x86[xoffset];
    145       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    146                                        dst_pitch, 8, HFilter);
    147     }
    148   } else {
    149     /* Second-pass only */
    150     VFilter = vp8_six_tap_x86[yoffset];
    151     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
    152                                      src_pixels_per_line, dst_ptr, dst_pitch, 8,
    153                                      VFilter);
    154   }
    155 }
    156 
    157 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
    158                                 int xoffset, int yoffset,
    159                                 unsigned char *dst_ptr, int dst_pitch) {
    160   DECLARE_ALIGNED(16, unsigned short,
    161                   FData2[256]); /* Temp data bufffer used in filtering */
    162   const short *HFilter, *VFilter;
    163 
    164   if (xoffset) {
    165     if (yoffset) {
    166       HFilter = vp8_six_tap_x86[xoffset];
    167       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
    168                                   src_pixels_per_line, 1, 9, 16, HFilter);
    169       VFilter = vp8_six_tap_x86[yoffset];
    170       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
    171                                   dst_pitch, VFilter);
    172     } else {
    173       /* First-pass only */
    174       HFilter = vp8_six_tap_x86[xoffset];
    175       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
    176                                        dst_pitch, 4, HFilter);
    177     }
    178   } else {
    179     /* Second-pass only */
    180     VFilter = vp8_six_tap_x86[yoffset];
    181     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
    182                                      src_pixels_per_line, dst_ptr, dst_pitch, 4,
    183                                      VFilter);
    184   }
    185 }
    186 
    187 #endif
    188 
    189 #if HAVE_SSSE3
    190 
    191 extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
    192                                          unsigned int src_pixels_per_line,
    193                                          unsigned char *output_ptr,
    194                                          unsigned int output_pitch,
    195                                          unsigned int output_height,
    196                                          unsigned int vp8_filter_index);
    197 
    198 extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
    199                                           unsigned int src_pixels_per_line,
    200                                           unsigned char *output_ptr,
    201                                           unsigned int output_pitch,
    202                                           unsigned int output_height,
    203                                           unsigned int vp8_filter_index);
    204 
    205 extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
    206                                           unsigned int src_pitch,
    207                                           unsigned char *output_ptr,
    208                                           unsigned int out_pitch,
    209                                           unsigned int output_height,
    210                                           unsigned int vp8_filter_index);
    211 
    212 extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
    213                                          unsigned int src_pitch,
    214                                          unsigned char *output_ptr,
    215                                          unsigned int out_pitch,
    216                                          unsigned int output_height,
    217                                          unsigned int vp8_filter_index);
    218 
    219 extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
    220                                          unsigned int src_pixels_per_line,
    221                                          unsigned char *output_ptr,
    222                                          unsigned int output_pitch,
    223                                          unsigned int output_height,
    224                                          unsigned int vp8_filter_index);
    225 
    226 extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
    227                                          unsigned int src_pitch,
    228                                          unsigned char *output_ptr,
    229                                          unsigned int out_pitch,
    230                                          unsigned int output_height,
    231                                          unsigned int vp8_filter_index);
    232 
    233 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
    234                                    int src_pixels_per_line, int xoffset,
    235                                    int yoffset, unsigned char *dst_ptr,
    236                                    int dst_pitch) {
    237   DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
    238 
    239   if (xoffset) {
    240     if (yoffset) {
    241       vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    242                                     src_pixels_per_line, FData2, 16, 21,
    243                                     xoffset);
    244       vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
    245                                     yoffset);
    246     } else {
    247       /* First-pass only */
    248       vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    249                                     dst_pitch, 16, xoffset);
    250     }
    251   } else {
    252     if (yoffset) {
    253       /* Second-pass only */
    254       vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    255                                     src_pixels_per_line, dst_ptr, dst_pitch, 16,
    256                                     yoffset);
    257     } else {
    258       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    259        * yoffset==0) case correctly. Add copy function here to guarantee
    260        * six-tap function handles all possible offsets. */
    261       vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    262     }
    263   }
    264 }
    265 
    266 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
    267                                  int src_pixels_per_line, int xoffset,
    268                                  int yoffset, unsigned char *dst_ptr,
    269                                  int dst_pitch) {
    270   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
    271 
    272   if (xoffset) {
    273     if (yoffset) {
    274       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    275                                    src_pixels_per_line, FData2, 8, 13, xoffset);
    276       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
    277     } else {
    278       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    279                                    dst_pitch, 8, xoffset);
    280     }
    281   } else {
    282     if (yoffset) {
    283       /* Second-pass only */
    284       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    285                                    src_pixels_per_line, dst_ptr, dst_pitch, 8,
    286                                    yoffset);
    287     } else {
    288       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    289        * yoffset==0) case correctly. Add copy function here to guarantee
    290        * six-tap function handles all possible offsets. */
    291       vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    292     }
    293   }
    294 }
    295 
    296 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
    297                                  int src_pixels_per_line, int xoffset,
    298                                  int yoffset, unsigned char *dst_ptr,
    299                                  int dst_pitch) {
    300   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
    301 
    302   if (xoffset) {
    303     if (yoffset) {
    304       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    305                                    src_pixels_per_line, FData2, 8, 9, xoffset);
    306       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
    307     } else {
    308       /* First-pass only */
    309       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    310                                    dst_pitch, 4, xoffset);
    311     }
    312   } else {
    313     if (yoffset) {
    314       /* Second-pass only */
    315       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    316                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
    317                                    yoffset);
    318     } else {
    319       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    320        * yoffset==0) case correctly. Add copy function here to guarantee
    321        * six-tap function handles all possible offsets. */
    322       vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
    323     }
    324   }
    325 }
    326 
    327 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
    328                                  int src_pixels_per_line, int xoffset,
    329                                  int yoffset, unsigned char *dst_ptr,
    330                                  int dst_pitch) {
    331   DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
    332 
    333   if (xoffset) {
    334     if (yoffset) {
    335       vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
    336                                    src_pixels_per_line, FData2, 4, 9, xoffset);
    337       vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
    338     } else {
    339       vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
    340                                    dst_pitch, 4, xoffset);
    341     }
    342   } else {
    343     if (yoffset) {
    344       vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
    345                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
    346                                    yoffset);
    347     } else {
    348       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
    349        * yoffset==0) case correctly. Add copy function here to guarantee
    350        * six-tap function handles all possible offsets. */
    351       int r;
    352 
    353       for (r = 0; r < 4; ++r) {
    354         dst_ptr[0] = src_ptr[0];
    355         dst_ptr[1] = src_ptr[1];
    356         dst_ptr[2] = src_ptr[2];
    357         dst_ptr[3] = src_ptr[3];
    358         dst_ptr += dst_pitch;
    359         src_ptr += src_pixels_per_line;
    360       }
    361     }
    362   }
    363 }
    364 
    365 #endif
    366