Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 
     13 #include "./vpx_config.h"
     14 #include "./vp9_rtcd.h"
     15 #include "vpx_ports/mem.h"
     16 ///////////////////////////////////////////////////////////////////////////
     17 // the mmx function that does the bilinear filtering and var calculation //
     18 // int one pass                                                          //
     19 ///////////////////////////////////////////////////////////////////////////
     20 DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
     21   { 128, 128, 128, 128,  0,  0,  0,  0 },
     22   { 120, 120, 120, 120,  8,  8,  8,  8 },
     23   { 112, 112, 112, 112, 16, 16, 16, 16 },
     24   { 104, 104, 104, 104, 24, 24, 24, 24 },
     25   {  96, 96, 96, 96, 32, 32, 32, 32 },
     26   {  88, 88, 88, 88, 40, 40, 40, 40 },
     27   {  80, 80, 80, 80, 48, 48, 48, 48 },
     28   {  72, 72, 72, 72, 56, 56, 56, 56 },
     29   {  64, 64, 64, 64, 64, 64, 64, 64 },
     30   {  56, 56, 56, 56, 72, 72, 72, 72 },
     31   {  48, 48, 48, 48, 80, 80, 80, 80 },
     32   {  40, 40, 40, 40, 88, 88, 88, 88 },
     33   {  32, 32, 32, 32, 96, 96, 96, 96 },
     34   {  24, 24, 24, 24, 104, 104, 104, 104 },
     35   {  16, 16, 16, 16, 112, 112, 112, 112 },
     36   {   8,  8,  8,  8, 120, 120, 120, 120 }
     37 };
     38 
     39 typedef void filter8_1dfunction (
     40   const unsigned char *src_ptr,
     41   const unsigned int src_pitch,
     42   unsigned char *output_ptr,
     43   unsigned int out_pitch,
     44   unsigned int output_height,
     45   const short *filter
     46 );
     47 
     48 #if HAVE_SSSE3
     49 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
     50 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
     51 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
     52 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
     53 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
     54 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
     55 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
     56 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
     57 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
     58 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
     59 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
     60 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
     61 
     62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
     63                                uint8_t *dst, ptrdiff_t dst_stride,
     64                                const int16_t *filter_x, int x_step_q4,
     65                                const int16_t *filter_y, int y_step_q4,
     66                                int w, int h) {
     67   /* Ensure the filter can be compressed to int16_t. */
     68   if (x_step_q4 == 16 && filter_x[3] != 128) {
     69     while (w >= 16) {
     70       vp9_filter_block1d16_h8_ssse3(src, src_stride,
     71                                     dst, dst_stride,
     72                                     h, filter_x);
     73       src += 16;
     74       dst += 16;
     75       w -= 16;
     76     }
     77     while (w >= 8) {
     78       vp9_filter_block1d8_h8_ssse3(src, src_stride,
     79                                    dst, dst_stride,
     80                                    h, filter_x);
     81       src += 8;
     82       dst += 8;
     83       w -= 8;
     84     }
     85     while (w >= 4) {
     86       vp9_filter_block1d4_h8_ssse3(src, src_stride,
     87                                    dst, dst_stride,
     88                                    h, filter_x);
     89       src += 4;
     90       dst += 4;
     91       w -= 4;
     92     }
     93   }
     94   if (w) {
     95     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
     96                           filter_x, x_step_q4, filter_y, y_step_q4,
     97                           w, h);
     98   }
     99 }
    100 
    101 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    102                               uint8_t *dst, ptrdiff_t dst_stride,
    103                               const int16_t *filter_x, int x_step_q4,
    104                               const int16_t *filter_y, int y_step_q4,
    105                               int w, int h) {
    106   if (y_step_q4 == 16 && filter_y[3] != 128) {
    107     while (w >= 16) {
    108       vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
    109                                     dst, dst_stride,
    110                                     h, filter_y);
    111       src += 16;
    112       dst += 16;
    113       w -= 16;
    114     }
    115     while (w >= 8) {
    116       vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
    117                                    dst, dst_stride,
    118                                    h, filter_y);
    119       src += 8;
    120       dst += 8;
    121       w -= 8;
    122     }
    123     while (w >= 4) {
    124       vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
    125                                    dst, dst_stride,
    126                                    h, filter_y);
    127       src += 4;
    128       dst += 4;
    129       w -= 4;
    130     }
    131   }
    132   if (w) {
    133     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
    134                          filter_x, x_step_q4, filter_y, y_step_q4,
    135                          w, h);
    136   }
    137 }
    138 
    139 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    140                                uint8_t *dst, ptrdiff_t dst_stride,
    141                                const int16_t *filter_x, int x_step_q4,
    142                                const int16_t *filter_y, int y_step_q4,
    143                                int w, int h) {
    144   if (x_step_q4 == 16 && filter_x[3] != 128) {
    145     while (w >= 16) {
    146       vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
    147                                     dst, dst_stride,
    148                                     h, filter_x);
    149       src += 16;
    150       dst += 16;
    151       w -= 16;
    152     }
    153     while (w >= 8) {
    154       vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
    155                                    dst, dst_stride,
    156                                    h, filter_x);
    157       src += 8;
    158       dst += 8;
    159       w -= 8;
    160     }
    161     while (w >= 4) {
    162       vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
    163                                    dst, dst_stride,
    164                                    h, filter_x);
    165       src += 4;
    166       dst += 4;
    167       w -= 4;
    168     }
    169   }
    170   if (w) {
    171     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
    172                               filter_x, x_step_q4, filter_y, y_step_q4,
    173                               w, h);
    174   }
    175 }
    176 
    177 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    178                               uint8_t *dst, ptrdiff_t dst_stride,
    179                               const int16_t *filter_x, int x_step_q4,
    180                               const int16_t *filter_y, int y_step_q4,
    181                               int w, int h) {
    182   if (y_step_q4 == 16 && filter_y[3] != 128) {
    183     while (w >= 16) {
    184       vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
    185                                     dst, dst_stride,
    186                                     h, filter_y);
    187       src += 16;
    188       dst += 16;
    189       w -= 16;
    190     }
    191     while (w >= 8) {
    192       vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
    193                                    dst, dst_stride,
    194                                    h, filter_y);
    195       src += 8;
    196       dst += 8;
    197       w -= 8;
    198     }
    199     while (w >= 4) {
    200       vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
    201                                    dst, dst_stride,
    202                                    h, filter_y);
    203       src += 4;
    204       dst += 4;
    205       w -= 4;
    206     }
    207   }
    208   if (w) {
    209     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
    210                              filter_x, x_step_q4, filter_y, y_step_q4,
    211                              w, h);
    212   }
    213 }
    214 
    215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    216                          uint8_t *dst, ptrdiff_t dst_stride,
    217                          const int16_t *filter_x, int x_step_q4,
    218                          const int16_t *filter_y, int y_step_q4,
    219                          int w, int h) {
    220   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
    221 
    222   assert(w <= 64);
    223   assert(h <= 64);
    224   if (x_step_q4 == 16 && y_step_q4 == 16) {
    225     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
    226                               filter_x, x_step_q4, filter_y, y_step_q4,
    227                               w, h + 7);
    228     vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
    229                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    230   } else {
    231     vp9_convolve8_c(src, src_stride, dst, dst_stride,
    232                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    233   }
    234 }
    235 
    236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    237                          uint8_t *dst, ptrdiff_t dst_stride,
    238                          const int16_t *filter_x, int x_step_q4,
    239                          const int16_t *filter_y, int y_step_q4,
    240                          int w, int h) {
    241   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
    242 
    243   assert(w <= 64);
    244   assert(h <= 64);
    245   if (x_step_q4 == 16 && y_step_q4 == 16) {
    246     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
    247                               filter_x, x_step_q4, filter_y, y_step_q4,
    248                               w, h + 7);
    249     vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
    250                                  filter_x, x_step_q4, filter_y, y_step_q4,
    251                                  w, h);
    252   } else {
    253     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
    254                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    255   }
    256 }
    257 #endif
    258 
    259 #if HAVE_SSE2
    260 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
    261 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
    262 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
    263 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
    264 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
    265 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
    266 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
    267 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
    268 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
    269 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
    270 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
    271 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
    272 
    273 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
    274                                uint8_t *dst, ptrdiff_t dst_stride,
    275                                const int16_t *filter_x, int x_step_q4,
    276                                const int16_t *filter_y, int y_step_q4,
    277                                int w, int h) {
    278   /* Ensure the filter can be compressed to int16_t. */
    279   if (x_step_q4 == 16 && filter_x[3] != 128) {
    280     while (w >= 16) {
    281       vp9_filter_block1d16_h8_sse2(src, src_stride,
    282                                     dst, dst_stride,
    283                                     h, filter_x);
    284       src += 16;
    285       dst += 16;
    286       w -= 16;
    287     }
    288     while (w >= 8) {
    289       vp9_filter_block1d8_h8_sse2(src, src_stride,
    290                                    dst, dst_stride,
    291                                    h, filter_x);
    292       src += 8;
    293       dst += 8;
    294       w -= 8;
    295     }
    296     while (w >= 4) {
    297       vp9_filter_block1d4_h8_sse2(src, src_stride,
    298                                    dst, dst_stride,
    299                                    h, filter_x);
    300       src += 4;
    301       dst += 4;
    302       w -= 4;
    303     }
    304   }
    305   if (w) {
    306     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
    307                           filter_x, x_step_q4, filter_y, y_step_q4,
    308                           w, h);
    309   }
    310 }
    311 
    312 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
    313                               uint8_t *dst, ptrdiff_t dst_stride,
    314                               const int16_t *filter_x, int x_step_q4,
    315                               const int16_t *filter_y, int y_step_q4,
    316                               int w, int h) {
    317   if (y_step_q4 == 16 && filter_y[3] != 128) {
    318     while (w >= 16) {
    319       vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
    320                                     dst, dst_stride,
    321                                     h, filter_y);
    322       src += 16;
    323       dst += 16;
    324       w -= 16;
    325     }
    326     while (w >= 8) {
    327       vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
    328                                    dst, dst_stride,
    329                                    h, filter_y);
    330       src += 8;
    331       dst += 8;
    332       w -= 8;
    333     }
    334     while (w >= 4) {
    335       vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
    336                                    dst, dst_stride,
    337                                    h, filter_y);
    338       src += 4;
    339       dst += 4;
    340       w -= 4;
    341     }
    342   }
    343   if (w) {
    344     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
    345                          filter_x, x_step_q4, filter_y, y_step_q4,
    346                          w, h);
    347   }
    348 }
    349 
    350 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
    351                                uint8_t *dst, ptrdiff_t dst_stride,
    352                                const int16_t *filter_x, int x_step_q4,
    353                                const int16_t *filter_y, int y_step_q4,
    354                                int w, int h) {
    355   if (x_step_q4 == 16 && filter_x[3] != 128) {
    356     while (w >= 16) {
    357       vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
    358                                     dst, dst_stride,
    359                                     h, filter_x);
    360       src += 16;
    361       dst += 16;
    362       w -= 16;
    363     }
    364     while (w >= 8) {
    365       vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
    366                                    dst, dst_stride,
    367                                    h, filter_x);
    368       src += 8;
    369       dst += 8;
    370       w -= 8;
    371     }
    372     while (w >= 4) {
    373       vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
    374                                    dst, dst_stride,
    375                                    h, filter_x);
    376       src += 4;
    377       dst += 4;
    378       w -= 4;
    379     }
    380   }
    381   if (w) {
    382     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
    383                               filter_x, x_step_q4, filter_y, y_step_q4,
    384                               w, h);
    385   }
    386 }
    387 
    388 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
    389                               uint8_t *dst, ptrdiff_t dst_stride,
    390                               const int16_t *filter_x, int x_step_q4,
    391                               const int16_t *filter_y, int y_step_q4,
    392                               int w, int h) {
    393   if (y_step_q4 == 16 && filter_y[3] != 128) {
    394     while (w >= 16) {
    395       vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
    396                                     dst, dst_stride,
    397                                     h, filter_y);
    398       src += 16;
    399       dst += 16;
    400       w -= 16;
    401     }
    402     while (w >= 8) {
    403       vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
    404                                    dst, dst_stride,
    405                                    h, filter_y);
    406       src += 8;
    407       dst += 8;
    408       w -= 8;
    409     }
    410     while (w >= 4) {
    411       vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
    412                                    dst, dst_stride,
    413                                    h, filter_y);
    414       src += 4;
    415       dst += 4;
    416       w -= 4;
    417     }
    418   }
    419   if (w) {
    420     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
    421                              filter_x, x_step_q4, filter_y, y_step_q4,
    422                              w, h);
    423   }
    424 }
    425 
    426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
    427                          uint8_t *dst, ptrdiff_t dst_stride,
    428                          const int16_t *filter_x, int x_step_q4,
    429                          const int16_t *filter_y, int y_step_q4,
    430                          int w, int h) {
    431   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
    432 
    433   assert(w <= 64);
    434   assert(h <= 64);
    435   if (x_step_q4 == 16 && y_step_q4 == 16) {
    436     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
    437                               filter_x, x_step_q4, filter_y, y_step_q4,
    438                               w, h + 7);
    439     vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
    440                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    441   } else {
    442     vp9_convolve8_c(src, src_stride, dst, dst_stride,
    443                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    444   }
    445 }
    446 
    447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
    448                          uint8_t *dst, ptrdiff_t dst_stride,
    449                          const int16_t *filter_x, int x_step_q4,
    450                          const int16_t *filter_y, int y_step_q4,
    451                          int w, int h) {
    452   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
    453 
    454   assert(w <= 64);
    455   assert(h <= 64);
    456   if (x_step_q4 == 16 && y_step_q4 == 16) {
    457     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
    458                               filter_x, x_step_q4, filter_y, y_step_q4,
    459                               w, h + 7);
    460     vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
    461                                  filter_x, x_step_q4, filter_y, y_step_q4,
    462                                  w, h);
    463   } else {
    464     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
    465                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    466   }
    467 }
    468 #endif
    469