/* ih264_weighted_pred_intr_sse42.c -- x86 SSE4.2 weighted prediction */
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /*****************************************************************************/
     21 /*                                                                           */
     22 /*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
     23 /*                                                                           */
     24 /*  Description       : Contains function definitions for weighted           */
     25 /*                      prediction functions in x86 sse4 intrinsics          */
     26 /*                                                                           */
     27 /*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
     28 /*                      ih264_default_weighted_pred_chroma_sse42()           */
     29 /*                      ih264_weighted_pred_luma_sse42()                     */
     30 /*                      ih264_weighted_pred_chroma_sse42()                   */
     31 /*                      ih264_weighted_bipred_luma_sse42()                   */
     32 /*                      ih264_weighted_bipred_chroma_sse42()                 */
     33 /*                                                                           */
     34 /*  Issues / Problems : None                                                 */
     35 /*                                                                           */
     36 /*  Revision History  :                                                      */
     37 /*                                                                           */
     38 /*         DD MM YYYY   Author(s)       Changes                              */
     39 /*         30 01 2015   Kaushik         Initial version                      */
     40 /*                      Senthoor                                             */
     41 /*                                                                           */
     42 /*****************************************************************************/
     43 /*****************************************************************************/
     44 /* File Includes                                                             */
     45 /*****************************************************************************/
     46 
     47 #include <immintrin.h>
     48 #include "ih264_typedefs.h"
     49 #include "ih264_macros.h"
     50 #include "ih264_platform_macros.h"
     51 #include "ih264_weighted_pred.h"
     52 
     53 /*****************************************************************************/
     54 /*  Function definitions .                                                   */
     55 /*****************************************************************************/
     56 /*****************************************************************************/
     57 /*                                                                           */
     58 /*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
     59 /*                                                                           */
     60 /*  Description   : This function performs the default weighted prediction   */
     61 /*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
     62 /*                  sample prediction process" for luma. The function gets   */
     63 /*                  two ht x wd blocks, calculates their rounded-average and */
     64 /*                  stores it in the destination block. (ht,wd) can be       */
     65 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
     66 /*                                                                           */
     67 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
     68 /*                  pu1_src2  - Pointer to source 2                          */
     69 /*                  pu1_dst   - Pointer to destination                       */
     70 /*                  src_strd1 - stride for source 1                          */
     71 /*                  src_strd1 - stride for source 2                          */
     72 /*                  dst_strd  - stride for destination                       */
     73 /*                  ht        - height of the block                          */
     74 /*                  wd        - width of the block                           */
     75 /*                                                                           */
     76 /*  Issues        : None                                                     */
     77 /*                                                                           */
     78 /*  Revision History:                                                        */
     79 /*                                                                           */
     80 /*         DD MM YYYY   Author(s)       Changes                              */
     81 /*         04 02 2015   Kaushik         Initial Version                      */
     82 /*                      Senthoor                                             */
     83 /*                                                                           */
     84 /*****************************************************************************/
     85 void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
     86                                             UWORD8 *pu1_src2,
     87                                             UWORD8 *pu1_dst,
     88                                             WORD32 src_strd1,
     89                                             WORD32 src_strd2,
     90                                             WORD32 dst_strd,
     91                                             WORD32 ht,
     92                                             WORD32 wd)
     93 {
     94     __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
     95     __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
     96 
     97     if(wd == 4)
     98     {
     99         do
    100         {
    101             y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
    102             y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
    103             y0_2_16x8b = _mm_loadl_epi64(
    104                             (__m128i *)(pu1_src1 + (src_strd1 << 1)));
    105             y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
    106 
    107             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
    108             y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
    109             y1_2_16x8b = _mm_loadl_epi64(
    110                             (__m128i *)(pu1_src2 + (src_strd2 << 1)));
    111             y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
    112 
    113             y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
    114             y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
    115             y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
    116             y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
    117 
    118             *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
    119             *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
    120             *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
    121             *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);
    122 
    123             ht -= 4;
    124             pu1_src1 += src_strd1 << 2;
    125             pu1_src2 += src_strd2 << 2;
    126             pu1_dst += dst_strd << 2;
    127         }
    128         while(ht > 0);
    129     }
    130     else if(wd == 8)
    131     {
    132         do
    133         {
    134             y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
    135             y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
    136             y0_2_16x8b = _mm_loadl_epi64(
    137                             (__m128i *)(pu1_src1 + (src_strd1 << 1)));
    138             y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
    139 
    140             y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
    141             y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
    142             y1_2_16x8b = _mm_loadl_epi64(
    143                             (__m128i *)(pu1_src2 + (src_strd2 << 1)));
    144             y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
    145 
    146             y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
    147             y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
    148             y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
    149             y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
    150 
    151             _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
    152             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
    153             _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
    154             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
    155 
    156             ht -= 4;
    157             pu1_src1 += src_strd1 << 2;
    158             pu1_src2 += src_strd2 << 2;
    159             pu1_dst += dst_strd << 2;
    160         }
    161         while(ht > 0);
    162     }
    163     else // wd == 16
    164     {
    165         __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
    166         __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
    167 
    168         do
    169         {
    170             y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
    171             y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
    172             y0_2_16x8b = _mm_loadu_si128(
    173                             (__m128i *)(pu1_src1 + (src_strd1 << 1)));
    174             y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
    175             y0_4_16x8b = _mm_loadu_si128(
    176                             (__m128i *)(pu1_src1 + (src_strd1 << 2)));
    177             y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
    178             y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
    179             y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
    180 
    181             y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
    182             y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
    183             y1_2_16x8b = _mm_loadu_si128(
    184                             (__m128i *)(pu1_src2 + (src_strd2 << 1)));
    185             y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
    186             y1_4_16x8b = _mm_loadu_si128(
    187                             (__m128i *)(pu1_src2 + (src_strd2 << 2)));
    188             y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
    189             y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
    190             y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
    191 
    192             y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
    193             y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
    194             y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
    195             y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
    196             y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
    197             y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
    198             y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
    199             y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
    200 
    201             _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
    202             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
    203             _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
    204             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
    205             _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
    206             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
    207             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
    208             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
    209 
    210             ht -= 8;
    211             pu1_src1 += src_strd1 << 3;
    212             pu1_src2 += src_strd2 << 3;
    213             pu1_dst += dst_strd << 3;
    214         }
    215         while(ht > 0);
    216     }
    217 }
    218 
    219 /*****************************************************************************/
    220 /*                                                                           */
    221 /*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
    222 /*                                                                           */
    223 /*  Description   : This function performs the default weighted prediction   */
    224 /*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
    225 /*                  sample prediction process" for chroma. The function gets */
    226 /*                  two ht x wd blocks, calculates their rounded-average and */
    227 /*                  stores it in the destination block. (ht,wd) can be       */
    228 /*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
    229 /*                                                                           */
    230 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
    231 /*                  pu1_src2  - Pointer to source 2                          */
    232 /*                  pu1_dst   - Pointer to destination                       */
    233 /*                  src_strd1 - stride for source 1                          */
    234 /*                  src_strd1 - stride for source 2                          */
    235 /*                  dst_strd  - stride for destination                       */
    236 /*                  ht        - height of the block                          */
    237 /*                  wd        - width of the block                           */
    238 /*                                                                           */
    239 /*  Issues        : None                                                     */
    240 /*                                                                           */
    241 /*  Revision History:                                                        */
    242 /*                                                                           */
    243 /*         DD MM YYYY   Author(s)       Changes                              */
    244 /*         04 02 2015   Kaushik         Initial Version                      */
    245 /*                      Senthoor                                             */
    246 /*                                                                           */
    247 /*****************************************************************************/
    248 void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
    249                                               UWORD8 *pu1_src2,
    250                                               UWORD8 *pu1_dst,
    251                                               WORD32 src_strd1,
    252                                               WORD32 src_strd2,
    253                                               WORD32 dst_strd,
    254                                               WORD32 ht,
    255                                               WORD32 wd)
    256 {
    257     __m128i uv0_0_16x8b, uv0_1_16x8b;
    258     __m128i uv1_0_16x8b, uv1_1_16x8b;
    259 
    260     if(wd == 2)
    261     {
    262         do
    263         {
    264             uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
    265             uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
    266 
    267             uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
    268             uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
    269 
    270             uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
    271             uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
    272 
    273             *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
    274             *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);
    275 
    276             ht -= 2;
    277             pu1_src1 += src_strd1 << 1;
    278             pu1_src2 += src_strd2 << 1;
    279             pu1_dst += dst_strd << 1;
    280         }
    281         while(ht > 0);
    282     }
    283     else if(wd == 4)
    284     {
    285         do
    286         {
    287             uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
    288             uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
    289 
    290             uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
    291             uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
    292 
    293             uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
    294             uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
    295 
    296             _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
    297             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
    298 
    299             ht -= 2;
    300             pu1_src1 += src_strd1 << 1;
    301             pu1_src2 += src_strd2 << 1;
    302             pu1_dst += dst_strd << 1;
    303         }
    304         while(ht > 0);
    305     }
    306     else // wd == 8
    307     {
    308         __m128i uv0_2_16x8b, uv0_3_16x8b;
    309         __m128i uv1_2_16x8b, uv1_3_16x8b;
    310 
    311         do
    312         {
    313             uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
    314             uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
    315             uv0_2_16x8b = _mm_loadu_si128(
    316                             (__m128i *)(pu1_src1 + (src_strd1 << 1)));
    317             uv0_3_16x8b = _mm_loadu_si128(
    318                             (__m128i *)(pu1_src1 + src_strd1 * 3));
    319 
    320             uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
    321             uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
    322             uv1_2_16x8b = _mm_loadu_si128(
    323                             (__m128i *)(pu1_src2 + (src_strd2 << 1)));
    324             uv1_3_16x8b = _mm_loadu_si128(
    325                             (__m128i *)(pu1_src2 + src_strd2 * 3));
    326 
    327             uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
    328             uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
    329             uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
    330             uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
    331 
    332             _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
    333             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
    334             _mm_storeu_si128(
    335                             (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
    336             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
    337 
    338             ht -= 4;
    339             pu1_src1 += src_strd1 << 2;
    340             pu1_src2 += src_strd2 << 2;
    341             pu1_dst += dst_strd << 2;
    342         }
    343         while(ht > 0);
    344     }
    345 }
    346 
    347 /*****************************************************************************/
    348 /*                                                                           */
    349 /*  Function Name : ih264_weighted_pred_luma_sse42                           */
    350 /*                                                                           */
    351 /*  Description   : This function performs the weighted prediction as        */
    352 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
    353 /*                  prediction process" for luma. The function gets one      */
    354 /*                  ht x wd block, weights it, rounds it off, offsets it,    */
    355 /*                  saturates it to unsigned 8-bit and stores it in the      */
    356 /*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
    357 /*                  (8,8), (16,8), (8,16) or (16,16).                        */
    358 /*                                                                           */
    359 /*  Inputs        : pu1_src  - Pointer to source                             */
    360 /*                  pu1_dst  - Pointer to destination                        */
    361 /*                  src_strd - stride for source                             */
    362 /*                  dst_strd - stride for destination                        */
    363 /*                  log_wd   - number of bits to be rounded off              */
    364 /*                  wt       - weight value                                  */
    365 /*                  ofst     - offset value                                  */
    366 /*                  ht       - height of the block                           */
    367 /*                  wd       - width of the block                            */
    368 /*                                                                           */
    369 /*  Issues        : None                                                     */
    370 /*                                                                           */
    371 /*  Revision History:                                                        */
    372 /*                                                                           */
    373 /*         DD MM YYYY   Author(s)       Changes                              */
    374 /*         04 02 2015   Kaushik         Initial Version                      */
    375 /*                      Senthoor                                             */
    376 /*                                                                           */
    377 /*****************************************************************************/
    378 void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
    379                                     UWORD8 *pu1_dst,
    380                                     WORD32 src_strd,
    381                                     WORD32 dst_strd,
    382                                     WORD32 log_wd,
    383                                     WORD32 wt,
    384                                     WORD32 ofst,
    385                                     WORD32 ht,
    386                                     WORD32 wd)
    387 {
    388     __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
    389 
    390     __m128i wt_8x16b, round_8x16b, ofst_8x16b;
    391 
    392     WORD32 round_val;
    393 
    394     wt = (WORD16)(wt & 0xffff);
    395     round_val = 1 << (log_wd - 1);
    396     ofst = (WORD8)(ofst & 0xff);
    397 
    398     wt_8x16b = _mm_set1_epi16(wt);
    399     round_8x16b = _mm_set1_epi16(round_val);
    400     ofst_8x16b = _mm_set1_epi16(ofst);
    401 
    402     if(wd == 4)
    403     {
    404         __m128i y_0_8x16b, y_2_8x16b;
    405 
    406         do
    407         {
    408             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
    409             y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
    410             y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
    411             y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
    412 
    413             y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
    414             y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
    415 
    416             y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
    417             y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
    418 
    419             y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
    420             y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
    421 
    422             y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
    423             y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
    424 
    425             y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
    426             y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
    427 
    428             y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
    429             y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
    430 
    431             y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
    432             y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
    433             y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
    434             y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
    435 
    436             *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
    437             *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
    438             *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
    439             *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);
    440 
    441             ht -= 4;
    442             pu1_src += src_strd << 2;
    443             pu1_dst += dst_strd << 2;
    444         }
    445         while(ht > 0);
    446     }
    447     else if(wd == 8)
    448     {
    449         __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
    450 
    451         do
    452         {
    453             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
    454             y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
    455             y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
    456             y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
    457 
    458             y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
    459             y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
    460             y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
    461             y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
    462 
    463             y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
    464             y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
    465             y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
    466             y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
    467 
    468             y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
    469             y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
    470             y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
    471             y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
    472 
    473             y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
    474             y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
    475             y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
    476             y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
    477 
    478             y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
    479             y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
    480             y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
    481             y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
    482 
    483             y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
    484             y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
    485             y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
    486             y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
    487 
    488             _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
    489             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
    490             _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
    491             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
    492 
    493             ht -= 4;
    494             pu1_src += src_strd << 2;
    495             pu1_dst += dst_strd << 2;
    496         }
    497         while(ht > 0);
    498     }
    499     else // wd == 16
    500     {
    501         __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
    502         __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
    503 
    504         __m128i zero_16x8b;
    505         zero_16x8b = _mm_set1_epi8(0);
    506 
    507         do
    508         {
    509             y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    510             y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
    511             y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
    512             y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
    513 
    514             y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
    515             y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
    516             y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
    517             y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
    518             y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
    519             y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
    520             y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
    521             y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
    522 
    523             y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
    524             y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
    525             y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
    526             y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
    527             y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
    528             y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
    529             y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
    530             y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
    531 
    532             y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
    533             y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
    534             y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
    535             y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
    536             y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
    537             y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
    538             y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
    539             y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
    540 
    541             y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
    542             y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
    543             y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
    544             y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
    545             y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
    546             y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
    547             y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
    548             y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
    549 
    550             y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
    551             y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
    552             y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
    553             y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
    554             y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
    555             y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
    556             y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
    557             y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
    558 
    559             y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
    560             y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
    561             y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
    562             y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
    563 
    564             _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
    565             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
    566             _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
    567             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
    568 
    569             ht -= 4;
    570             pu1_src += src_strd << 2;
    571             pu1_dst += dst_strd << 2;
    572         }
    573         while(ht > 0);
    574     }
    575 }
    576 
    577 /*****************************************************************************/
    578 /*                                                                           */
    579 /*  Function Name : ih264_weighted_pred_chroma_sse42                         */
    580 /*                                                                           */
    581 /*  Description   : This function performs the weighted prediction as        */
    582 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
    583 /*                  prediction process" for chroma. The function gets one    */
    584 /*                  ht x wd block, weights it, rounds it off, offsets it,    */
    585 /*                  saturates it to unsigned 8-bit and stores it in the      */
    586 /*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
    587 /*                  (4,4), (8,4), (4,8) or (8,8).                            */
    588 /*                                                                           */
    589 /*  Inputs        : pu1_src  - Pointer to source                             */
    590 /*                  pu1_dst  - Pointer to destination                        */
    591 /*                  src_strd - stride for source                             */
    592 /*                  dst_strd - stride for destination                        */
    593 /*                  log_wd   - number of bits to be rounded off              */
    594 /*                  wt       - weight values for u and v                     */
    595 /*                  ofst     - offset values for u and v                     */
    596 /*                  ht       - height of the block                           */
    597 /*                  wd       - width of the block                            */
    598 /*                                                                           */
    599 /*  Issues        : None                                                     */
    600 /*                                                                           */
    601 /*  Revision History:                                                        */
    602 /*                                                                           */
    603 /*         DD MM YYYY   Author(s)       Changes                              */
    604 /*         04 02 2015   Kaushik         Initial Version                      */
    605 /*                      Senthoor                                             */
    606 /*                                                                           */
    607 /*****************************************************************************/
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    /* Per sample: out = sat_u8(((in * wt + round) >> log_wd) + ofst),   */
    /* with separate (wt, ofst) values for the interleaved U and V       */
    /* samples. wd counts UV pairs, so a row holds 2*wd source bytes.    */
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

    /* ofst carries the U offset in its low byte and the V offset in     */
    /* the next byte; extract each as a signed 8-bit quantity.           */
    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    /* NOTE(review): 1 << (log_wd - 1) is undefined for log_wd == 0;     */
    /* presumably callers guarantee log_wd >= 1 on this path - confirm.  */
    round_val = 1 << (log_wd - 1);
    /* re-pack the offsets as two 16-bit lanes: (ofst_v << 16) | ofst_u  */
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    /* wt packs the U and V weights as a 16-bit pair; replicating the    */
    /* 32-bit pair yields alternating u,v,u,v,... lanes, matching the    */
    /* interleaved UV byte layout of the source rows.                    */
    wt_8x16b = _mm_set1_epi32(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);

    if(wd == 2)
    {
        __m128i y_0_8x16b;

        /* 2 UV pairs per row (4 bytes); two rows per iteration          */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            /* pack both rows into one register: row0 bytes 0-3,         */
            /* row1 bytes 4-7, then widen to eight 16-bit lanes          */
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            /* weight, round, shift, offset */
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            /* saturate to u8; row1 sits in bytes 4-7 after the pack     */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y_0_8x16b, y_1_8x16b;

        /* 4 UV pairs per row (8 bytes); two rows per iteration          */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            /* saturate-pack both rows; row1 lands in the high 8 bytes   */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        /* 8 UV pairs per row (16 bytes); four rows per iteration        */
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            /* widen each row: L = low 8 bytes, H = high 8 bytes         */
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            /* saturate each row back to 16 unsigned bytes and store     */
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}
    781 
    782 /*****************************************************************************/
    783 /*                                                                           */
    784 /*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
    785 /*                                                                           */
    786 /*  Description   : This function performs the weighted biprediction as      */
    787 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
    788 /*                  prediction process" for luma. The function gets two      */
    789 /*                  ht x wd blocks, weights them, adds them, rounds off the  */
    790 /*                  sum, offsets it, saturates it to unsigned 8-bit and      */
    791 /*                  stores it in the destination block. (ht,wd) can be       */
    792 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
    793 /*                                                                           */
    794 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
    795 /*                  pu1_src2  - Pointer to source 2                          */
    796 /*                  pu1_dst   - Pointer to destination                       */
    797 /*                  src_strd1 - stride for source 1                          */
    798 /*                  src_strd2 - stride for source 2                          */
    799 /*                  dst_strd  - stride for destination                      */
    800 /*                  log_wd    - number of bits to be rounded off             */
    801 /*                  wt1       - weight value for source 1                    */
    802 /*                  wt2       - weight value for source 2                    */
    803 /*                  ofst1     - offset value for source 1                    */
    804 /*                  ofst2     - offset value for source 2                    */
    805 /*                  ht        - height of the block                          */
    806 /*                  wd        - width of the block                           */
    807 /*                                                                           */
    808 /*  Issues        : None                                                     */
    809 /*                                                                           */
    810 /*  Revision History:                                                        */
    811 /*                                                                           */
    812 /*         DD MM YYYY   Author(s)       Changes                              */
    813 /*         04 02 2015   Kaushik         Initial Version                      */
    814 /*                      Senthoor                                             */
    815 /*                                                                           */
    816 /*****************************************************************************/
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    /* Per sample:                                                       */
    /*   out = sat_u8(((in1*wt1 + in2*wt2 + round) >> (log_wd+1)) + ofst)*/
    /* where round = 1 << log_wd and ofst is the rounded mean of the two */
    /* source offsets.                                                   */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* keep only the signed 16-bit part of each weight */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    /* offsets are signed 8-bit; bi-pred uses their rounded average      */
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        /* four rows of 4 pixels per iteration, rows paired in registers */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* pair rows (0,1) and (2,3) of each source into one         */
            /* register, then widen to 16-bit lanes                      */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            /* saturating adds guard the weighted sum against 16-bit     */
            /* overflow                                                  */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* saturate-pack all four rows into one register; each row   */
            /* occupies one 4-byte chunk                                 */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);


            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* four rows of 8 pixels per iteration */
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            /* saturating adds guard the weighted sum against 16-bit     */
            /* overflow                                                  */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            /* saturate-pack rows pairwise; odd rows land in the high    */
            /* 8 bytes                                                   */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* two rows of 16 pixels per iteration; L/H = low/high 8 bytes   */
        /* of a row widened to 16-bit lanes                              */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* saturating adds guard the weighted sum against 16-bit     */
            /* overflow                                                  */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* saturate each row back to 16 unsigned bytes and store     */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
   1068 
   1069 /*****************************************************************************/
   1070 /*                                                                           */
   1071 /*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
   1072 /*                                                                           */
   1073 /*  Description   : This function performs the weighted biprediction as      */
   1074 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
   1075 /*                  prediction process" for chroma. The function gets two    */
   1076 /*                  ht x wd blocks, weights them, adds them, rounds off the  */
   1077 /*                  sum, offsets it, saturates it to unsigned 8-bit and      */
   1078 /*                  stores it in the destination block. (ht,wd) can be       */
   1079 /*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
   1080 /*                                                                           */
   1081 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
   1082 /*                  pu1_src2  - Pointer to source 2                          */
   1083 /*                  pu1_dst   - Pointer to destination                       */
   1084 /*                  src_strd1 - stride for source 1                          */
   1085 /*                  src_strd2 - stride for source 2                          */
   1086 /*                  dst_strd  - stride for destination                      */
   1087 /*                  log_wd    - number of bits to be rounded off             */
   1088 /*                  wt1       - weight values for u and v in source 1        */
   1089 /*                  wt2       - weight values for u and v in source 2        */
   1090 /*                  ofst1     - offset value for u and v in source 1         */
   1091 /*                  ofst2     - offset value for u and v in source 2         */
   1092 /*                  ht        - height of the block                          */
   1093 /*                  wd        - width of the block                           */
   1094 /*                                                                           */
   1095 /*  Issues        : None                                                     */
   1096 /*                                                                           */
   1097 /*  Revision History:                                                        */
   1098 /*                                                                           */
   1099 /*         DD MM YYYY   Author(s)       Changes                              */
   1100 /*         04 02 2015   Kaushik         Initial Version                      */
   1101 /*                      Senthoor                                             */
   1102 /*                                                                           */
   1103 /*****************************************************************************/
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    /* Raw interleaved-UV byte rows: y1_* from source 1, y2_* from source 2 */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    /* Per-lane 16-bit weights and the combined offset / rounding constants */
    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    /* Bi-predictive weighting: result = (p1*wt1 + p2*wt2 + 2^log_wd) >> (log_wd + 1) + ofst */
    round_val = 1 << log_wd;
    shft = log_wd + 1;

    /* ofst1/ofst2 pack the U offset in the low byte and the V offset in the
       next byte; the WORD8 casts sign-extend each 8-bit component. */
    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    /* wt1/wt2 carry the U and V weights in their two 16-bit halves (see the
       function header); replicating them as 32-bit units lines the weights up
       with the interleaved U,V 16-bit lanes after byte widening. */
    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    /* Combined bi-pred offset per component: (ofst1 + ofst2 + 1) >> 1 */
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    /* Re-pack as a {U,V} 16-bit pair so set1_epi32 alternates U/V offsets */
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    /* All three width paths process two rows per iteration; ht is assumed to
       be even (standard for H.264 chroma block heights). */
    if(wd == 2)
    {
        /* wd == 2: 2 UV pairs = 4 bytes per row */
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            /* NOTE(review): loadl reads 8 bytes per row but only the low 4
               are consumed below — relies on the usual codec-buffer padding */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Merge the two 4-byte rows of each source into one 8-byte run */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            /* Zero-extend bytes to 16-bit lanes for the weighted arithmetic */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            /* p1*wt1 and p2*wt2 (weights alternate U,V across lanes) */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            /* Saturating adds keep intermediate sums within 16-bit range */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            /* Arithmetic shift completes the rounding, then add the offset */
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            /* Clamp to [0,255] and split the two rows back apart */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            /* 4-byte stores, one per output row */
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* wd == 4: 4 UV pairs = 8 bytes per row; each row fills a register */
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Zero-extend each 8-byte row to eight 16-bit lanes */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            /* Weight both sources, both rows */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            /* Sum sources, add rounding term (saturating) */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            /* Shift down, then apply the combined U/V offsets */
            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            /* Clamp to bytes: row 0 in the low half, row 1 in the high half */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        /* wd == 8: 8 UV pairs = 16 bytes per row; each row is split into a
           low (L) and high (H) half of eight 16-bit lanes. */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* Widen: cvtepu8 handles the low 8 bytes, unpackhi-with-zero the
               high 8 — both produce zero-extended 16-bit lanes. */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* Weight source 1 (all four half-rows) */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            /* Weight source 2 */
            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* Sum the weighted sources (saturating) */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* Add rounding term */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            /* Shift down by log_wd + 1 */
            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            /* Apply combined U/V offsets */
            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* Clamp to [0,255] and reassemble full 16-byte rows */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
   1304