/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>
#include <tmmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// shuffle masks used only by the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// shuffle masks for the 8_h8 and 16_h8 convolutions
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
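
// Example: shuffling a source row [s0 s1 ... s15] with filt1_global yields
// [s0 s1 s1 s2 s2 s3 s3 s4 s4 s5 s5 s6 s6 s7 s7 s8]; _mm_maddubs_epi16 with
// a register holding the byte pair (k0, k1) in every 16-bit lane then gives
// the eight partial sums s0*k0 + s1*k1, s1*k0 + s2*k1, ..., s7*k0 + s8*k1.
// filt2..filt4 produce the same pattern for the remaining tap pairs.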

// These are reused by the avx2 intrinsics.
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;

void vpx_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with the rounding constant 64 in each 16-bit lane
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits of the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits of the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits of the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits of the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the upper half of each result
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // arithmetic-shift each 16-bit result right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // saturating-pack each 16-bit result to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
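
// For reference, each horizontal kernel above computes, per output pixel x,
// the scalar expression below (a sketch; the SIMD version additionally
// orders the saturating adds, see the note after the 8_h8 kernel):
//   sum = 0;
//   for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_ptr[x - 3 + k] * filter[k];
//   output_ptr[x] = clamp((sum + 64) >> 7, 0, 255);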

void vpx_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with the rounding constant 64 in each 16-bit lane
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across the 128-bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 128-bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // arithmetic-shift each 16-bit result right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // saturating-pack each 16-bit result to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
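
// Note on the min/max sequence used above: _mm_adds_epi16 saturates, so the
// four partial sums are not added left to right. The outer-tap products are
// combined first, then the smaller of the two inner-tap products is added
// before the larger one, which keeps the running total as small as possible
// and avoids clipping a valid result through intermediate saturation.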

void vpx_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with the rounding constant 64 in each 16-bit lane
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) filter taps to 8-bit (byte) and duplicate
  // them in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits of the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits of the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits of the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits of the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the next row of 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge adjacent rows together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge adjacent rows together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // arithmetic-shift each 16-bit result right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // saturating-pack each 16-bit result to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes of the convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
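
// The vertical kernel keeps a 7-row sliding window in registers: each
// iteration loads only the one new row (srcReg8) and rotates the previous
// rows down, so every source row is read from memory exactly once.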

filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;

filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);
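
// Roughly, each FUN_CONV_1D invocation above (see vpx_dsp/x86/convolve.h)
// emits the corresponding vpx_convolve8_{horiz,vert}[_avg]_ssse3() wrapper,
// which walks the block in strips and dispatches to the matching
// vpx_filter_block1d{16,8,4}_{h,v}{8,2}[_avg]_ssse3 kernel, using the 2-tap
// path when only the two center filter taps are non-zero and the C fallback
// otherwise.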

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  {                                                                       \
    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
                                                                          \
    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
                                                                          \
    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
                                                                          \
    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
  }
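
// TRANSPOSE_8X8 performs a byte transpose of the 8x8 block held in the low
// halves of in0..in7. Each out register carries its transposed row
// duplicated in both 64-bit halves (note the unpack*_epi64(tr2_n, tr2_n)
// pairs), so callers store only the low 8 bytes of each output.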

static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
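  // Note: _mm_mulhrs_epi16(v, k_256) computes (v * 256 + (1 << 14)) >> 15,
  // which equals (v + 64) >> 7: round-to-nearest by FILTER_BITS in a single
  // instruction. All the scaled kernels below use the same trick.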
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit result right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // saturating-pack each 16-bit result to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes of the convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A, B, C, D, E, F, G, H;

  A = _mm_loadl_epi64((const __m128i *)src);
  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));

  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);

  _mm_storel_epi64((__m128i *)dst, A);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
}

static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so round it up to the next multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 block of filtered values into dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
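
// When x_q4 lands on an integer pixel (x_q4 & SUBPEL_MASK == 0) the kernel
// degenerates to the identity at tap 3 (128 at the center), so the copy
// branch above reads the source pixel directly at offset 3 (src was
// pre-decremented by SUBPEL_TAPS / 2 - 1) instead of running the filter.
// The w4 and vertical scaled paths below use the same shortcut.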

static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit result right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // saturating-pack each 16-bit result to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}

static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 block of filtered values into dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit result right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // saturating-pack each 16-bit result to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift each 16-bit result right by 7 bits
  temp = _mm_mulhrs_epi16(temp, k_256);
  // saturating-pack each 16-bit result to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes of the convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge adjacent rows together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge adjacent rows together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge adjacent rows together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift each 16-bit result right by 7 bits
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // saturating-pack the 16-bit results to 8 bits: the low lane holds the
    // first eight convolve results and the high lane the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes of the convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *y_filters, int y0_q4,
                                    int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  // Note: the fixed-size intermediate buffer, temp, places limits on the
  // parameters. 2-D filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round up because the block may be located at a sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --(((64 - 1) * 32 + 15) >> 4) + 8 = 134, rounded up to 135 here.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  }
}

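// Worked check of intermediate_height at the extremes permitted by the
// asserts above: h = 64, y_step_q4 = 32 and y0_q4 = 15 give
// (((64 - 1) * 32 + 15) >> 4) + 8 = 126 + 8 = 134 rows, within the 135-row
// bound used for temp.
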
void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}

// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
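
// FUN_CONV_2D (vpx_dsp/x86/convolve.h) composes the two 1-D passes above
// into vpx_convolve8_ssse3() and vpx_convolve8_avg_ssse3(): the horizontal
// filter runs into an aligned intermediate buffer tall enough for the
// vertical 8-tap tails, then the vertical filter runs from that buffer
// into dst.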
    889