Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 
     19 
     20 /**
     21 *******************************************************************************
     22 * @file
     23 *  ihevc_inter_pred_filters_x86_intr.c
     24 *
     25 * @brief
     26 *  Contains function definitions for inter prediction  interpolation filters
     27 *  coded in x86 intrinsics
     28 *
     29 *
     30 * @author
     31 *
     32 *
     33 * @par List of Functions:
     34 *  - ihevc_inter_pred_luma_copy_w16out_sse42()
     35 *  - ihevc_inter_pred_chroma_copy_sse42()
     36 *  - ihevc_inter_pred_chroma_copy_w16out_sse42()
     37 *
     38 * @remarks
     39 *  None
     40 *
     41 *******************************************************************************
     42 */
     43 
     44 
     45 /*****************************************************************************/
     46 /* File Includes                                                             */
     47 /*****************************************************************************/
     48 #include <assert.h>
     49 
     50 #include "ihevc_debug.h"
     51 #include "ihevc_typedefs.h"
     52 #include "ihevc_defs.h"
     53 #include "ihevc_inter_pred.h"
     54 #include "ihevc_macros.h"
     55 #include "ihevc_platform_macros.h"
     56 #include "ihevc_func_selector.h"
     57 
     58 #include <immintrin.h>
     59 #include <emmintrin.h>
     60 #include <smmintrin.h>
     61 #include <tmmintrin.h>
     62 
     63 /*****************************************************************************/
     64 /* Function Definitions                                                      */
     65 /*****************************************************************************/
     66 
     67 
     68 
     69 /**
     70 *******************************************************************************
     71 *
     72 * @brief
     73 *       Interprediction luma filter for copy 16bit output
     74 *
     75 * @par Description:
     76 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
     77 *    by 'src' to the location pointed by 'dst' The output is upshifted by 6
     78 *    bits and is used as input for vertical filtering or weighted prediction
     79 *
     80 * @param[in] pu1_src
     81 *  UWORD8 pointer to the source
     82 *
     83 * @param[out] pi2_dst
     84 *  WORD16 pointer to the destination
     85 *
     86 * @param[in] src_strd
     87 *  integer source stride
     88 *
     89 * @param[in] dst_strd
     90 *  integer destination stride
     91 *
     92 * @param[in] pi1_coeff
     93 *  WORD8 pointer to the filter coefficients
     94 *
     95 * @param[in] ht
     96 *  integer height of the array
     97 *
     98 * @param[in] wd
     99 *  integer width of the array
    100 *
    101 * @returns
    102 *
    103 * @remarks
    104 *  None
    105 *
    106 *******************************************************************************
    107 */
    108 
    109 void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
    110                                              WORD16 *pi2_dst,
    111                                              WORD32 src_strd,
    112                                              WORD32 dst_strd,
    113                                              WORD8 *pi1_coeff,
    114                                              WORD32 ht,
    115                                              WORD32 wd)
    116 {
    117     WORD32 row, col;
    118     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
    119     UNUSED(pi1_coeff);
    120     ASSERT(wd % 4 == 0); /* checking assumption*/
    121     ASSERT(ht % 4 == 0); /* checking assumption*/
    122 
    123     if(0 == (wd & 7)) /* multiple of 8 case */
    124     {
    125         for(row = 0; row < ht; row += 4)
    126         {
    127             for(col = 0; col < wd; col += 8)
    128             {
    129                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    130                 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    131                 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    132                 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    133                 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    134 
    135                 src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    136                 src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    137                 src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
    138                 src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
    139 
    140                 src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    141                 src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    142                 src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    143                 src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    144 
    145                 /* storing 16 8-bit output values */
    146                 _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    147                 _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    148                 _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    149                 _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    150 
    151                 pu1_src += 8; /* pointer update */
    152                 pi2_dst += 8; /* pointer update */
    153             } /* inner for loop ends here(8-output values in single iteration) */
    154 
    155             pu1_src += 4 * src_strd - wd; /* pointer update */
    156             pi2_dst += 4 * dst_strd - wd; /* pointer update */
    157         }
    158     }
    159     else /* wd = multiple of 4 case */
    160     {
    161         for(row = 0; row < ht; row += 4)
    162         {
    163             for(col = 0; col < wd; col += 4)
    164             {
    165                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    166                 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    167                 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    168                 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    169                 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    170 
    171                 src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    172                 src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    173                 src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
    174                 src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
    175 
    176                 src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    177                 src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    178                 src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    179                 src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    180 
    181                 /* storing 16 8-bit output values */
    182                 _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    183                 _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    184                 _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    185                 _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    186 
    187                 pu1_src += 4; /* pointer update */
    188                 pi2_dst += 4; /* pointer update */
    189             } /* inner for loop ends here(4-output values in single iteration) */
    190 
    191             pu1_src += 4 * src_strd - wd; /* pointer update */
    192             pi2_dst += 4 * dst_strd - wd; /* pointer update */
    193         }
    194     }
    195 }
    196 
    197 /**
    198 *******************************************************************************
    199 *
    200 * @brief
    201 *      Chroma interprediction filter for copy
    202 *
    203 * @par Description:
    204 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
    205 *    by 'src' to the location pointed by 'dst'
    206 *
    207 * @param[in] pu1_src
    208 *  UWORD8 pointer to the source
    209 *
    210 * @param[out] pu1_dst
    211 *  UWORD8 pointer to the destination
    212 *
    213 * @param[in] src_strd
    214 *  integer source stride
    215 *
    216 * @param[in] dst_strd
    217 *  integer destination stride
    218 *
    219 * @param[in] pi1_coeff
    220 *  WORD8 pointer to the filter coefficients
    221 *
    222 * @param[in] ht
    223 *  integer height of the array
    224 *
    225 * @param[in] wd
    226 *  integer width of the array
    227 *
    228 * @returns
    229 *
    230 * @remarks
    231 *  None
    232 *
    233 *******************************************************************************
    234 */
    235 
    236 void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
    237                                         UWORD8 *pu1_dst,
    238                                         WORD32 src_strd,
    239                                         WORD32 dst_strd,
    240                                         WORD8 *pi1_coeff,
    241                                         WORD32 ht,
    242                                         WORD32 wd)
    243 {
    244     WORD32 row, col, wdx2;
    245     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
    246 
    247     ASSERT(wd % 2 == 0); /* checking assumption*/
    248     ASSERT(ht % 2 == 0); /* checking assumption*/
    249     UNUSED(pi1_coeff);
    250     wdx2 = wd * 2;
    251 
    252     if(0 == (ht & 3)) /* ht multiple of 4 */
    253     {
    254         if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
    255         {
    256             for(row = 0; row < ht; row += 4)
    257             {
    258                 for(col = 0; col < wdx2; col += 16)
    259                 {
    260                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    261                     src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
    262                     src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    263                     src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    264                     src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    265 
    266                     /* storing 16 8-bit output values */
    267                     _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    268                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    269                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    270                     _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    271 
    272                     pu1_src += 16; /* pointer update */
    273                     pu1_dst += 16; /* pointer update */
    274                 } /* inner for loop ends here(16-output values in single iteration) */
    275 
    276                 pu1_src += 4 * src_strd - wdx2; /* pointer update */
    277                 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
    278             }
    279 
    280         }
    281         else if(0 == (wdx2 & 7)) /* multiple of 8 case */
    282         {
    283             for(row = 0; row < ht; row += 4)
    284             {
    285                 for(col = 0; col < wdx2; col += 8)
    286                 {
    287                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    288                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    289                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    290                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    291                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    292 
    293                     /* storing 16 8-bit output values */
    294                     _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    295                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    296                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    297                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    298 
    299                     pu1_src += 8; /* pointer update */
    300                     pu1_dst += 8; /* pointer update */
    301                 } /*  inner for loop ends here(8-output values in single iteration) */
    302 
    303                 pu1_src += 4 * src_strd - wdx2; /* pointer update */
    304                 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
    305             }
    306         }
    307         else /* wdx2 = multiple of 4 case */
    308         {
    309             WORD32 dst0, dst1, dst2, dst3;
    310             for(row = 0; row < ht; row += 4)
    311             {
    312                 for(col = 0; col < wdx2; col += 4)
    313                 {
    314                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    315                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    316                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    317                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    318                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    319 
    320                     dst0 = _mm_cvtsi128_si32(src0_16x8b);
    321                     dst1 = _mm_cvtsi128_si32(src1_16x8b);
    322                     dst2 = _mm_cvtsi128_si32(src2_16x8b);
    323                     dst3 = _mm_cvtsi128_si32(src3_16x8b);
    324 
    325                     /* storing 4 8-bit output values */
    326                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
    327                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
    328                     *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
    329                     *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
    330 
    331                     pu1_src += 4; /* pointer update */
    332                     pu1_dst += 4; /* pointer update */
    333                 } /*  inner for loop ends here(4- output values in single iteration) */
    334 
    335                 pu1_src += 4 * src_strd - wdx2; /* pointer update */
    336                 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
    337             }
    338         }
    339     }
    340     else /* ht multiple of 2 */
    341     {
    342         if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
    343         {
    344             for(row = 0; row < ht; row += 2)
    345             {
    346                 for(col = 0; col < wdx2; col += 16)
    347                 {
    348                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    349                     src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
    350                     src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    351 
    352                     /* storing 16 8-bit output values */
    353                     _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    354                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    355 
    356                     pu1_src += 16; /* pointer update */
    357                     pu1_dst += 16; /* pointer update */
    358                 } /* inner for loop ends here(16-output values in single iteration) */
    359 
    360                 pu1_src += 2 * src_strd - wdx2; /* pointer update */
    361                 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
    362             }
    363 
    364         }
    365         else if(0 == (wdx2 & 7)) /* multiple of 8 case */
    366         {
    367             for(row = 0; row < ht; row += 2)
    368             {
    369                 for(col = 0; col < wdx2; col += 8)
    370                 {
    371                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    372                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    373                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    374 
    375                     /* storing 16 8-bit output values */
    376                     _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    377                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    378 
    379                     pu1_src += 8; /* pointer update */
    380                     pu1_dst += 8; /* pointer update */
    381                 } /*  inner for loop ends here(8-output values in single iteration) */
    382 
    383                 pu1_src += 2 * src_strd - wdx2; /* pointer update */
    384                 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
    385             }
    386         }
    387         else /* wdx2 = multiple of 4 case */
    388         {
    389             WORD32 dst0, dst1;
    390             for(row = 0; row < ht; row += 2)
    391             {
    392                 for(col = 0; col < wdx2; col += 4)
    393                 {
    394                     /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    395                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    396                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    397 
    398                     dst0 = _mm_cvtsi128_si32(src0_16x8b);
    399                     dst1 = _mm_cvtsi128_si32(src1_16x8b);
    400 
    401 
    402                     /* storing 4 8-bit output values */
    403                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
    404                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
    405 
    406                     pu1_src += 4; /* pointer update */
    407                     pu1_dst += 4; /* pointer update */
    408                 } /*  inner for loop ends here(4- output values in single iteration) */
    409 
    410                 pu1_src += 2 * src_strd - wdx2; /* pointer update */
    411                 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
    412             }
    413         }
    414     }
    415 }
    416 
    417 /**
    418 *******************************************************************************
    419 *
    420 * @brief
    421 *       chroma interprediction filter for copying 16bit output
    422 *
    423 * @par Description:
    424 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
    425 *    by 'src' to the location pointed by 'dst' The output is upshifted by 6
    426 *    bits and is used as input for vertical filtering or weighted prediction
    427 *
    428 * @param[in] pu1_src
    429 *  UWORD8 pointer to the source
    430 *
    431 * @param[out] pi2_dst
    432 *  WORD16 pointer to the destination
    433 *
    434 * @param[in] src_strd
    435 *  integer source stride
    436 *
    437 * @param[in] dst_strd
    438 *  integer destination stride
    439 *
    440 * @param[in] pi1_coeff
    441 *  WORD8 pointer to the filter coefficients
    442 *
    443 * @param[in] ht
    444 *  integer height of the array
    445 *
    446 * @param[in] wd
    447 *  integer width of the array
    448 *
    449 * @returns
    450 *
    451 * @remarks
    452 *  None
    453 *
    454 *******************************************************************************
    455 */
    456 
    457 void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
    458                                                WORD16 *pi2_dst,
    459                                                WORD32 src_strd,
    460                                                WORD32 dst_strd,
    461                                                WORD8 *pi1_coeff,
    462                                                WORD32 ht,
    463                                                WORD32 wd)
    464 {
    465     WORD32 row, col, wdx2;
    466     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
    467 
    468     ASSERT(wd % 2 == 0); /* checking assumption*/
    469     ASSERT(ht % 2 == 0); /* checking assumption*/
    470     UNUSED(pi1_coeff);
    471     wdx2 = wd * 2;
    472 
    473     if(0 == (ht & 3)) /* multiple of 4 case */
    474     {
    475         if(0 == (wdx2 & 7)) /* multiple of 8 case */
    476         {
    477             for(row = 0; row < ht; row += 4)
    478             {
    479                 for(col = 0; col < wdx2; col += 8)
    480                 {
    481                     /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    482                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    483                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    484                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    485                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    486 
    487                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    488                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    489                     src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
    490                     src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
    491 
    492                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    493                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    494                     src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    495                     src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    496 
    497                     /* storing 16 8-bit output values */
    498                     _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    499                     _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    500                     _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    501                     _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    502 
    503                     pu1_src += 8; /* pointer update */
    504                     pi2_dst += 8; /* pointer update */
    505                 } /* inner for loop ends here(8-output values in single iteration) */
    506 
    507                 pu1_src += 4 * src_strd - wdx2; /* pointer update */
    508                 pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
    509             }
    510         }
    511         else /* wdx2 = multiple of 4 case */
    512         {
    513             for(row = 0; row < ht; row += 4)
    514             {
    515                 for(col = 0; col < wdx2; col += 4)
    516                 {
    517                     /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    518                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    519                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    520                     src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    521                     src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    522 
    523                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    524                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    525                     src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
    526                     src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
    527 
    528                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    529                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    530                     src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    531                     src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    532 
    533                     /* storing 16 8-bit output values */
    534                     _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    535                     _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    536                     _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    537                     _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    538 
    539                     pu1_src += 4; /* pointer update */
    540                     pi2_dst += 4; /* pointer update */
    541                 } /* inner for loop ends here(4-output values in single iteration) */
    542 
    543                 pu1_src += 4 * src_strd - wdx2; /* pointer update */
    544                 pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
    545             }
    546         }
    547     }
    548     else  /* ht multiple of 2 case */
    549     {
    550         if(0 == (wdx2 & 7)) /* multiple of 8 case */
    551         {
    552             for(row = 0; row < ht; row += 2)
    553             {
    554                 for(col = 0; col < wdx2; col += 8)
    555                 {
    556                     /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    557                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    558                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    559 
    560                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    561                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    562 
    563                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    564                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    565 
    566                     /* storing 16 8-bit output values */
    567                     _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    568                     _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    569 
    570                     pu1_src += 8; /* pointer update */
    571                     pi2_dst += 8; /* pointer update */
    572                 } /* inner for loop ends here(8-output values in single iteration) */
    573 
    574                 pu1_src += 2 * src_strd - wdx2; /* pointer update */
    575                 pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
    576             }
    577         }
    578         else /* wdx2 = multiple of 4 case */
    579         {
    580             for(row = 0; row < ht; row += 2)
    581             {
    582                 for(col = 0; col < wdx2; col += 4)
    583                 {
    584                     /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    585                     src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    586                     src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    587 
    588                     src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
    589                     src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
    590 
    591                     src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
    592                     src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
    593 
    594                     /* storing 16 8-bit output values */
    595                     _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
    596                     _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    597 
    598                     pu1_src += 4; /* pointer update */
    599                     pi2_dst += 4; /* pointer update */
    600                 } /* inner for loop ends here(4-output values in single iteration) */
    601 
    602                 pu1_src += 2 * src_strd - wdx2; /* pointer update */
    603                 pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
    604             }
    605         }
    606     }
    607 }
    608