/* ime_distortion_metrics_sse42.c — x86 SSE4.2 distortion metric routines */
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 ******************************************************************************
     23 * @file ime_distortion_metrics_sse42.c
     24 *
     25 * @brief
     26 *  This file contains definitions of routines that compute distortion
     27 *  between two macro/sub blocks of identical dimensions
     28 *
     29 * @author
     30 *  Ittiam
     31 *
     32 * @par List of Functions:
     33 *  - ime_compute_sad_16x16_sse42()
     34 *  - ime_compute_sad_16x16_fast_sse42()
     35 *  - ime_compute_sad_16x16_ea8_sse42()
     36 *  - ime_compute_sad_16x8_sse42()
     37 *  - ime_calculate_sad4_prog_sse42()
     38 *  - ime_sub_pel_compute_sad_16x16_sse42()
     39 *  - ime_compute_satqd_16x16_lumainter_sse42()
     40 *
     41 * @remarks
     42 *  None
     43 *
     44 *******************************************************************************
     45 */
     46 
     47 /*****************************************************************************/
     48 /* File Includes                                                             */
     49 /*****************************************************************************/
     50 
     51 /* System include files */
     52 #include <stdio.h>
     53 #include <stdlib.h>
     54 #include <string.h>
     55 
     56 /* User include files */
     57 #include "ime_typedefs.h"
     58 #include "ime_defs.h"
     59 #include "ime_macros.h"
     60 #include "ime_statistics.h"
     61 #include "ime_platform_macros.h"
     62 #include "ime_distortion_metrics.h"
     63 #include <immintrin.h>
     64 
     65 /*****************************************************************************/
     66 /* Function Definitions                                                      */
     67 /*****************************************************************************/
     68 
     69 /**
     70 ******************************************************************************
     71 *
     72 * @brief computes distortion (SAD) between 2 16x16 blocks
     73 *
     74 * @par   Description
      75 *   This function computes SAD between 2 16x16 blocks. The maximum allowed
      76 *   SAD argument is ignored by this implementation; the distortion of the
      77 *   entire block is always computed.
     78 *
     79 * @param[in] pu1_src
     80 *  UWORD8 pointer to the source
     81 *
      82 * @param[in] pu1_est
      83 *  UWORD8 pointer to the estimated block
     84 *
     85 * @param[in] src_strd
     86 *  integer source stride
     87 *
     88 * @param[in] dst_strd
     89 *  integer destination stride
     90 *
     91 * @param[in] i4_max_sad
     92 *  integer maximum allowed distortion
     93 *
     94 * @param[out] pi4_mb_distortion
     95 *  integer evaluated sad
     96 *
     97 * @remarks
     98 *
     99 ******************************************************************************
    100 */
    101 void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
    102                            UWORD8 *pu1_est,
    103                            WORD32 src_strd,
    104                            WORD32 est_strd,
    105                            WORD32 i4_max_sad,
    106                            WORD32 *pi4_mb_distortion)
    107 {
    108     __m128i src_r0, src_r1, src_r2, src_r3;
    109     __m128i est_r0, est_r1, est_r2, est_r3;
    110     __m128i res_r0, res_r1, res_r2, res_r3;
    111     __m128i sad_val;
    112     int val1, val2;
    113     UNUSED (i4_max_sad);
    114 
    115     // Row 0-3 sad calculation
    116     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    117     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    118     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    119     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    120 
    121     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    122     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    123     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    124     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    125 
    126     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    127     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    128     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    129     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    130 
    131     sad_val = _mm_add_epi64(res_r0, res_r1);
    132     sad_val = _mm_add_epi64(sad_val, res_r2);
    133     sad_val = _mm_add_epi64(sad_val, res_r3);
    134 
    135     // Row 4-7 sad calculation
    136     pu1_src += 4*src_strd;
    137     pu1_est += 4*est_strd;
    138 
    139     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    140     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    141     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    142     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    143 
    144     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    145     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    146     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    147     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    148 
    149     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    150     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    151     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    152     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    153 
    154     sad_val = _mm_add_epi64(sad_val, res_r0);
    155     sad_val = _mm_add_epi64(sad_val, res_r1);
    156     sad_val = _mm_add_epi64(sad_val, res_r2);
    157     sad_val = _mm_add_epi64(sad_val, res_r3);
    158 
    159     // Row 8-11 sad calculation
    160     pu1_src += 4*src_strd;
    161     pu1_est += 4*est_strd;
    162     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    163     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    164     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    165     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    166 
    167     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    168     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    169     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    170     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    171 
    172     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    173     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    174     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    175     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    176 
    177     sad_val = _mm_add_epi64(sad_val, res_r0);
    178     sad_val = _mm_add_epi64(sad_val, res_r1);
    179     sad_val = _mm_add_epi64(sad_val, res_r2);
    180     sad_val = _mm_add_epi64(sad_val, res_r3);
    181 
    182     // Row 12-15 sad calculation
    183     pu1_src += 4*src_strd;
    184     pu1_est += 4*est_strd;
    185     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    186     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    187     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    188     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    189 
    190     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    191     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    192     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    193     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    194 
    195     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    196     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    197     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    198     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    199 
    200     sad_val = _mm_add_epi64(sad_val, res_r0);
    201     sad_val = _mm_add_epi64(sad_val, res_r1);
    202     sad_val = _mm_add_epi64(sad_val, res_r2);
    203     sad_val = _mm_add_epi64(sad_val, res_r3);
    204 
    205     val1 = _mm_extract_epi32(sad_val,0);
    206     val2 = _mm_extract_epi32(sad_val, 2);
    207     *pi4_mb_distortion = (val1+val2);
    208 
    209     return;
    210 }
    211 
    212 /**
    213 ******************************************************************************
    214 *
    215 *  @brief computes distortion (SAD) between 2 16x8  blocks
    216 *
    217 *
    218 *  @par   Description
     219 *   This function computes SAD between 2 16x8 blocks. The maximum allowed
     220 *   SAD argument is ignored by this implementation; the distortion of the
     221 *   entire block is always computed.
    222 *
    223 * @param[in] pu1_src
    224 *  UWORD8 pointer to the source
    225 *
     226 * @param[in] pu1_est
     227 *  UWORD8 pointer to the estimated block
    228 *
    229 * @param[in] src_strd
    230 *  integer source stride
    231 *
    232 * @param[in] dst_strd
    233 *  integer destination stride
    234 *
     235 * @param[in] i4_max_sad
    236 *  integer maximum allowed distortion
    237 *
    238 * @param[out] pi4_mb_distortion
    239 *  integer evaluated sad
    240 *
    241 * @remarks
    242 *
    243 ******************************************************************************
    244 */
    245 void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
    246                     UWORD8 *pu1_est,
    247                     WORD32 src_strd,
    248                     WORD32 est_strd,
    249                     WORD32 i4_max_sad,
    250                     WORD32 *pi4_mb_distortion)
    251 {
    252     __m128i src_r0, src_r1, src_r2, src_r3;
    253     __m128i est_r0, est_r1, est_r2, est_r3;
    254     __m128i res_r0, res_r1, res_r2, res_r3;
    255     __m128i sad_val;
    256     int val1, val2;
    257     UNUSED (i4_max_sad);
    258 
    259     // Row 0-3 sad calculation
    260     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    261     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    262     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    263     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    264 
    265     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    266     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    267     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    268     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    269 
    270     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    271     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    272     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    273     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    274 
    275     sad_val = _mm_add_epi64(res_r0, res_r1);
    276     sad_val = _mm_add_epi64(sad_val, res_r2);
    277     sad_val = _mm_add_epi64(sad_val, res_r3);
    278 
    279     // Row 4-7 sad calculation
    280     pu1_src += 4*src_strd;
    281     pu1_est += 4*est_strd;
    282 
    283     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    284     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    285     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    286     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
    287 
    288     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    289     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    290     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    291     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
    292 
    293     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    294     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    295     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    296     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    297 
    298     sad_val = _mm_add_epi64(sad_val, res_r0);
    299     sad_val = _mm_add_epi64(sad_val, res_r1);
    300     sad_val = _mm_add_epi64(sad_val, res_r2);
    301     sad_val = _mm_add_epi64(sad_val, res_r3);
    302 
    303     val1 = _mm_extract_epi32(sad_val,0);
    304     val2 = _mm_extract_epi32(sad_val, 2);
    305     *pi4_mb_distortion = (val1+val2);
    306     return;
    307 }
    308 
    309 /**
    310 ******************************************************************************
    311 *
    312 * @brief computes distortion (SAD) between 2 16x16 blocks
    313 *
    314 * @par   Description
     315 *   This function computes SAD between 2 16x16 blocks. There is a provision
    316 *   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
    317 *   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
    318 *
    319 * @param[in] pu1_src
    320 *  UWORD8 pointer to the source
    321 *
     322 * @param[in] pu1_est
     323 *  UWORD8 pointer to the estimated block
    324 *
    325 * @param[in] src_strd
    326 *  integer source stride
    327 *
    328 * @param[in] dst_strd
    329 *  integer destination stride
    330 *
    331 * @param[in] i4_max_sad
    332 *  integer maximum allowed distortion
    333 *
    334 * @param[out] pi4_mb_distortion
    335 *  integer evaluated sad
    336 *
    337 * @remarks
    338 *
    339 ******************************************************************************
    340 */
    341 void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
    342                                UWORD8 *pu1_est,
    343                                WORD32 src_strd,
    344                                WORD32 est_strd,
    345                                WORD32 i4_max_sad,
    346                                WORD32 *pi4_mb_distortion)
    347 {
    348     __m128i src_r0, src_r1, src_r2, src_r3;
    349     __m128i est_r0, est_r1, est_r2, est_r3;
    350     __m128i res_r0, res_r1, res_r2, res_r3;
    351     __m128i sad_val;
    352     WORD32 val1, val2;
    353     WORD32 i4_sad;
    354     UWORD8 *pu1_src_temp = pu1_src + src_strd;
    355     UWORD8 *pu1_est_temp = pu1_est + est_strd;
    356 
    357     // Row 0,2,4,6 sad calculation
    358     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    359     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    360     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    361     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
    362 
    363     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    364     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    365     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    366     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
    367 
    368     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    369     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    370     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    371     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    372 
    373     sad_val = _mm_add_epi64(res_r0, res_r1);
    374     sad_val = _mm_add_epi64(sad_val, res_r2);
    375     sad_val = _mm_add_epi64(sad_val, res_r3);
    376 
    377     // Row 8,10,12,14 sad calculation
    378     pu1_src += 8*src_strd;
    379     pu1_est += 8*est_strd;
    380 
    381     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    382     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    383     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    384     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
    385 
    386     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    387     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    388     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    389     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
    390 
    391     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    392     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    393     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    394     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    395 
    396     sad_val = _mm_add_epi64(sad_val, res_r0);
    397     sad_val = _mm_add_epi64(sad_val, res_r1);
    398     sad_val = _mm_add_epi64(sad_val, res_r2);
    399     sad_val = _mm_add_epi64(sad_val, res_r3);
    400 
    401     pu1_src = pu1_src_temp;
    402     pu1_est = pu1_est_temp;
    403 
    404     val1 = _mm_extract_epi32(sad_val, 0);
    405     val2 = _mm_extract_epi32(sad_val, 2);
    406 
    407     i4_sad = val1 + val2;
    408     if (i4_max_sad < i4_sad)
    409     {
    410         *pi4_mb_distortion = i4_sad;
    411         return ;
    412     }
    413     // Row 1,3,5,7 sad calculation
    414     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    415     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    416     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    417     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
    418 
    419     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    420     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    421     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    422     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
    423 
    424     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    425     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    426     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    427     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    428 
    429     sad_val = _mm_add_epi64(sad_val, res_r0);
    430     sad_val = _mm_add_epi64(sad_val, res_r1);
    431     sad_val = _mm_add_epi64(sad_val, res_r2);
    432     sad_val = _mm_add_epi64(sad_val, res_r3);
    433 
    434     // Row 9,11,13,15 sad calculation
    435     pu1_src += 8*src_strd;
    436     pu1_est += 8*est_strd;
    437     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    438     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    439     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    440     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
    441 
    442     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    443     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    444     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    445     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
    446 
    447     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    448     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    449     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    450     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    451 
    452     sad_val = _mm_add_epi64(sad_val, res_r0);
    453     sad_val = _mm_add_epi64(sad_val, res_r1);
    454     sad_val = _mm_add_epi64(sad_val, res_r2);
    455     sad_val = _mm_add_epi64(sad_val, res_r3);
    456 
    457     val1 = _mm_extract_epi32(sad_val, 0);
    458     val2 = _mm_extract_epi32(sad_val, 2);
    459     *pi4_mb_distortion = (val1+val2);
    460 
    461     return;
    462 }
    463 
    464 /**
    465 ******************************************************************************
    466 *
    467 * @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
    468 *
    469 * @par   Description
    470 *   This functions computes SAD between 2 16x16 blocks by processing alternate
    471 *   rows (fast mode). For fast mode it is assumed sad obtained by processing
    472 *   alternate rows is approximately twice as that for the whole block.
    473 *
    474 * @param[in] pu1_src
    475 *  UWORD8 pointer to the source
    476 *
     477 * @param[in] pu1_est
     478 *  UWORD8 pointer to the estimated block
    479 *
    480 * @param[in] src_strd
    481 *  integer source stride
    482 *
    483 * @param[in] dst_strd
    484 *  integer destination stride
    485 *
    486 * @param[in] i4_max_sad
    487 *  integer maximum allowed distortion
    488 *
    489 * @param[out] pi4_mb_distortion
    490 *  integer evaluated sad
    491 *
    492 * @remarks
    493 *
    494 ******************************************************************************
    495 */
    496 void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
    497                                 UWORD8 *pu1_est,
    498                                 WORD32 src_strd,
    499                                 WORD32 est_strd,
    500                                 WORD32 i4_max_sad,
    501                                 WORD32 *pi4_mb_distortion)
    502 {
    503     __m128i src_r0, src_r1, src_r2, src_r3;
    504     __m128i est_r0, est_r1, est_r2, est_r3;
    505     __m128i res_r0, res_r1, res_r2, res_r3;
    506     __m128i sad_val;
    507     WORD32 val1, val2;
    508     WORD32 i4_sad;
    509     UWORD8 *pu1_src_temp = pu1_src + src_strd;
    510     UWORD8 *pu1_est_temp = pu1_est + est_strd;
    511     UNUSED (i4_max_sad);
    512 
    513     // Row 0,2,4,6 sad calculation
    514     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    515     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
    516     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
    517     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
    518 
    519     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    520     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
    521     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
    522     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
    523 
    524     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    525     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    526     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    527     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    528 
    529     sad_val = _mm_add_epi64(res_r0, res_r1);
    530     sad_val = _mm_add_epi64(sad_val, res_r2);
    531     sad_val = _mm_add_epi64(sad_val, res_r3);
    532 
    533     // Row 8,10,12,14 sad calculation
    534     pu1_src += 8 * src_strd;
    535     pu1_est += 8 * est_strd;
    536 
    537     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    538     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
    539     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
    540     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
    541 
    542     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    543     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
    544     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
    545     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
    546 
    547     res_r0 = _mm_sad_epu8(src_r0, est_r0);
    548     res_r1 = _mm_sad_epu8(src_r1, est_r1);
    549     res_r2 = _mm_sad_epu8(src_r2, est_r2);
    550     res_r3 = _mm_sad_epu8(src_r3, est_r3);
    551 
    552     sad_val = _mm_add_epi64(sad_val, res_r0);
    553     sad_val = _mm_add_epi64(sad_val, res_r1);
    554     sad_val = _mm_add_epi64(sad_val, res_r2);
    555     sad_val = _mm_add_epi64(sad_val, res_r3);
    556 
    557     pu1_src = pu1_src_temp;
    558     pu1_est = pu1_est_temp;
    559 
    560     val1 = _mm_extract_epi32(sad_val, 0);
    561     val2 = _mm_extract_epi32(sad_val, 2);
    562 
    563     i4_sad = val1 + val2;
    564     *pi4_mb_distortion = (i4_sad<<1);
    565     return;
    566 }
    567 
    568 /**
    569 *******************************************************************************
    570 *
    571 * @brief compute sad
    572 *
    573 * @par Description: This function computes the sad at vertices of diamond grid
    574 * centered at reference pointer and at unit distance from it.
    575 *
    576 * @param[in] pu1_ref
    577 *  UWORD8 pointer to the reference
    578 *
    579 * @param[out] pu1_src
    580 *  UWORD8 pointer to the source
    581 *
    582 * @param[in] ref_strd
    583 *  integer reference stride
    584 *
    585 * @param[in] src_strd
    586 *  integer source stride
    587 *
    588 * @param[out] pi4_sad
    589 *  pointer to integer array evaluated sad
    590 *
    591 * @returns  sad at all evaluated vertexes
    592 *
    593 * @remarks  none
    594 *
    595 *******************************************************************************
    596 */
    597 void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
    598                              UWORD8 *pu1_src,
    599                              WORD32 ref_strd,
    600                              WORD32 src_strd,
    601                              WORD32 *pi4_sad)
    602 {
    603     /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
    604     UWORD8 *left_ptr    = pu1_ref - 1;
    605     UWORD8 *right_ptr   = pu1_ref + 1;
    606     UWORD8 *top_ptr     = pu1_ref - ref_strd;
    607     UWORD8 *bot_ptr     = pu1_ref + ref_strd;
    608 
    609     WORD32 val1, val2;
    610     __m128i src, ref_left, ref_right, ref_top, ref_bot;
    611     __m128i res_r0, res_r1, res_r2, res_r3;
    612     __m128i sad_r0, sad_r1, sad_r2, sad_r3;
    613 
    614     // Row 0 sad calculation
    615     src = _mm_loadu_si128((__m128i *) (pu1_src));
    616     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    617     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    618     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    619     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    620 
    621     sad_r0 = _mm_sad_epu8(src, ref_left);
    622     sad_r1 = _mm_sad_epu8(src, ref_right);
    623     sad_r2 = _mm_sad_epu8(src, ref_top);
    624     sad_r3 = _mm_sad_epu8(src, ref_bot);
    625 
    626     pu1_src += src_strd;
    627     left_ptr += ref_strd;
    628     right_ptr += ref_strd;
    629     top_ptr += ref_strd;
    630     bot_ptr += ref_strd;
    631 
    632     // Row 1 sad calculation
    633     src = _mm_loadu_si128((__m128i *) (pu1_src));
    634     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    635     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    636     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    637     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    638 
    639     res_r0 = _mm_sad_epu8(src, ref_left);
    640     res_r1 = _mm_sad_epu8(src, ref_right);
    641     res_r2 = _mm_sad_epu8(src, ref_top);
    642     res_r3 = _mm_sad_epu8(src, ref_bot);
    643 
    644     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    645     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    646     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    647     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    648 
    649     pu1_src += src_strd;
    650     left_ptr += ref_strd;
    651     right_ptr += ref_strd;
    652     top_ptr += ref_strd;
    653     bot_ptr += ref_strd;
    654 
    655     // Row 2 sad calculation
    656     src = _mm_loadu_si128((__m128i *) (pu1_src));
    657     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    658     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    659     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    660     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    661 
    662     res_r0 = _mm_sad_epu8(src, ref_left);
    663     res_r1 = _mm_sad_epu8(src, ref_right);
    664     res_r2 = _mm_sad_epu8(src, ref_top);
    665     res_r3 = _mm_sad_epu8(src, ref_bot);
    666 
    667     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    668     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    669     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    670     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    671 
    672     pu1_src += src_strd;
    673     left_ptr += ref_strd;
    674     right_ptr += ref_strd;
    675     top_ptr += ref_strd;
    676     bot_ptr += ref_strd;
    677 
    678     // Row 3 sad calculation
    679     src = _mm_loadu_si128((__m128i *) (pu1_src));
    680     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    681     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    682     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    683     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    684 
    685     res_r0 = _mm_sad_epu8(src, ref_left);
    686     res_r1 = _mm_sad_epu8(src, ref_right);
    687     res_r2 = _mm_sad_epu8(src, ref_top);
    688     res_r3 = _mm_sad_epu8(src, ref_bot);
    689 
    690     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    691     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    692     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    693     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    694 
    695     pu1_src += src_strd;
    696     left_ptr += ref_strd;
    697     right_ptr += ref_strd;
    698     top_ptr += ref_strd;
    699     bot_ptr += ref_strd;
    700 
    701     // Row 4 sad calculation
    702     src = _mm_loadu_si128((__m128i *) (pu1_src));
    703     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    704     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    705     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    706     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    707 
    708     res_r0 = _mm_sad_epu8(src, ref_left);
    709     res_r1 = _mm_sad_epu8(src, ref_right);
    710     res_r2 = _mm_sad_epu8(src, ref_top);
    711     res_r3 = _mm_sad_epu8(src, ref_bot);
    712 
    713     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    714     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    715     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    716     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    717 
    718     pu1_src += src_strd;
    719     left_ptr += ref_strd;
    720     right_ptr += ref_strd;
    721     top_ptr += ref_strd;
    722     bot_ptr += ref_strd;
    723 
    724     // Row 5 sad calculation
    725     src = _mm_loadu_si128((__m128i *) (pu1_src));
    726     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    727     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    728     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    729     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    730 
    731     res_r0 = _mm_sad_epu8(src, ref_left);
    732     res_r1 = _mm_sad_epu8(src, ref_right);
    733     res_r2 = _mm_sad_epu8(src, ref_top);
    734     res_r3 = _mm_sad_epu8(src, ref_bot);
    735 
    736     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    737     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    738     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    739     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    740 
    741     pu1_src += src_strd;
    742     left_ptr += ref_strd;
    743     right_ptr += ref_strd;
    744     top_ptr += ref_strd;
    745     bot_ptr += ref_strd;
    746 
    747     // Row 6 sad calculation
    748     src = _mm_loadu_si128((__m128i *) (pu1_src));
    749     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    750     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    751     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    752     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    753 
    754     res_r0 = _mm_sad_epu8(src, ref_left);
    755     res_r1 = _mm_sad_epu8(src, ref_right);
    756     res_r2 = _mm_sad_epu8(src, ref_top);
    757     res_r3 = _mm_sad_epu8(src, ref_bot);
    758 
    759     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    760     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    761     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    762     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    763 
    764     pu1_src += src_strd;
    765     left_ptr += ref_strd;
    766     right_ptr += ref_strd;
    767     top_ptr += ref_strd;
    768     bot_ptr += ref_strd;
    769 
    770     // Row 7 sad calculation
    771     src = _mm_loadu_si128((__m128i *) (pu1_src));
    772     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    773     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    774     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    775     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    776 
    777     res_r0 = _mm_sad_epu8(src, ref_left);
    778     res_r1 = _mm_sad_epu8(src, ref_right);
    779     res_r2 = _mm_sad_epu8(src, ref_top);
    780     res_r3 = _mm_sad_epu8(src, ref_bot);
    781 
    782     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    783     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    784     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    785     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    786 
    787     pu1_src += src_strd;
    788     left_ptr += ref_strd;
    789     right_ptr += ref_strd;
    790     top_ptr += ref_strd;
    791     bot_ptr += ref_strd;
    792 
    793     // Row 8 sad calculation
    794     src = _mm_loadu_si128((__m128i *) (pu1_src));
    795     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    796     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    797     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    798     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    799 
    800     res_r0 = _mm_sad_epu8(src, ref_left);
    801     res_r1 = _mm_sad_epu8(src, ref_right);
    802     res_r2 = _mm_sad_epu8(src, ref_top);
    803     res_r3 = _mm_sad_epu8(src, ref_bot);
    804 
    805     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    806     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    807     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    808     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    809 
    810     pu1_src += src_strd;
    811     left_ptr += ref_strd;
    812     right_ptr += ref_strd;
    813     top_ptr += ref_strd;
    814     bot_ptr += ref_strd;
    815 
    816     // Row 9 sad calculation
    817     src = _mm_loadu_si128((__m128i *) (pu1_src));
    818     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    819     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    820     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    821     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    822 
    823     res_r0 = _mm_sad_epu8(src, ref_left);
    824     res_r1 = _mm_sad_epu8(src, ref_right);
    825     res_r2 = _mm_sad_epu8(src, ref_top);
    826     res_r3 = _mm_sad_epu8(src, ref_bot);
    827 
    828     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    829     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    830     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    831     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    832 
    833     pu1_src += src_strd;
    834     left_ptr += ref_strd;
    835     right_ptr += ref_strd;
    836     top_ptr += ref_strd;
    837     bot_ptr += ref_strd;
    838 
    839     // Row 10 sad calculation
    840     src = _mm_loadu_si128((__m128i *) (pu1_src));
    841     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    842     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    843     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    844     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    845 
    846     res_r0 = _mm_sad_epu8(src, ref_left);
    847     res_r1 = _mm_sad_epu8(src, ref_right);
    848     res_r2 = _mm_sad_epu8(src, ref_top);
    849     res_r3 = _mm_sad_epu8(src, ref_bot);
    850 
    851     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    852     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    853     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    854     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    855 
    856     pu1_src += src_strd;
    857     left_ptr += ref_strd;
    858     right_ptr += ref_strd;
    859     top_ptr += ref_strd;
    860     bot_ptr += ref_strd;
    861 
    862     // Row 11 sad calculation
    863     src = _mm_loadu_si128((__m128i *) (pu1_src));
    864     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    865     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    866     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    867     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    868 
    869     res_r0 = _mm_sad_epu8(src, ref_left);
    870     res_r1 = _mm_sad_epu8(src, ref_right);
    871     res_r2 = _mm_sad_epu8(src, ref_top);
    872     res_r3 = _mm_sad_epu8(src, ref_bot);
    873 
    874     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    875     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    876     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    877     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    878 
    879     pu1_src += src_strd;
    880     left_ptr += ref_strd;
    881     right_ptr += ref_strd;
    882     top_ptr += ref_strd;
    883     bot_ptr += ref_strd;
    884 
    885     // Row 12 sad calculation
    886     src = _mm_loadu_si128((__m128i *) (pu1_src));
    887     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    888     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    889     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    890     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    891 
    892     res_r0 = _mm_sad_epu8(src, ref_left);
    893     res_r1 = _mm_sad_epu8(src, ref_right);
    894     res_r2 = _mm_sad_epu8(src, ref_top);
    895     res_r3 = _mm_sad_epu8(src, ref_bot);
    896 
    897     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    898     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    899     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    900     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    901 
    902     pu1_src += src_strd;
    903     left_ptr += ref_strd;
    904     right_ptr += ref_strd;
    905     top_ptr += ref_strd;
    906     bot_ptr += ref_strd;
    907 
    908     // Row 13 sad calculation
    909     src = _mm_loadu_si128((__m128i *) (pu1_src));
    910     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    911     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    912     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    913     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    914 
    915     res_r0 = _mm_sad_epu8(src, ref_left);
    916     res_r1 = _mm_sad_epu8(src, ref_right);
    917     res_r2 = _mm_sad_epu8(src, ref_top);
    918     res_r3 = _mm_sad_epu8(src, ref_bot);
    919 
    920     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    921     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    922     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    923     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    924 
    925     pu1_src += src_strd;
    926     left_ptr += ref_strd;
    927     right_ptr += ref_strd;
    928     top_ptr += ref_strd;
    929     bot_ptr += ref_strd;
    930 
    931     // Row 14 sad calculation
    932     src = _mm_loadu_si128((__m128i *) (pu1_src));
    933     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    934     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    935     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    936     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    937 
    938     res_r0 = _mm_sad_epu8(src, ref_left);
    939     res_r1 = _mm_sad_epu8(src, ref_right);
    940     res_r2 = _mm_sad_epu8(src, ref_top);
    941     res_r3 = _mm_sad_epu8(src, ref_bot);
    942 
    943     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    944     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    945     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    946     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    947 
    948     pu1_src += src_strd;
    949     left_ptr += ref_strd;
    950     right_ptr += ref_strd;
    951     top_ptr += ref_strd;
    952     bot_ptr += ref_strd;
    953 
    954     // Row 15 sad calculation
    955     src = _mm_loadu_si128((__m128i *) (pu1_src));
    956     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    957     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    958     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    959     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
    960 
    961     res_r0 = _mm_sad_epu8(src, ref_left);
    962     res_r1 = _mm_sad_epu8(src, ref_right);
    963     res_r2 = _mm_sad_epu8(src, ref_top);
    964     res_r3 = _mm_sad_epu8(src, ref_bot);
    965 
    966     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    967     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    968     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    969     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    970 
    971     val1 = _mm_extract_epi32(sad_r0, 0);
    972     val2 = _mm_extract_epi32(sad_r0, 2);
    973     pi4_sad[0] = (val1 + val2);
    974 
    975     val1 = _mm_extract_epi32(sad_r1, 0);
    976     val2 = _mm_extract_epi32(sad_r1, 2);
    977     pi4_sad[1] = (val1 + val2);
    978 
    979     val1 = _mm_extract_epi32(sad_r2, 0);
    980     val2 = _mm_extract_epi32(sad_r2, 2);
    981     pi4_sad[2] = (val1 + val2);
    982 
    983     val1 = _mm_extract_epi32(sad_r3, 0);
    984     val2 = _mm_extract_epi32(sad_r3, 2);
    985     pi4_sad[3] = (val1 + val2);
    986 }
    987 
    988 /**
    989 ******************************************************************************
    990 *
    991 * @brief computes distortion (SAD) at all subpel points about the src location
    992 *
    993 * @par Description
    994 *   This functions computes SAD at all points at a subpel distance from the
    995 *   current source location.
    996 *
    997 * @param[in] pu1_src
    998 *  UWORD8 pointer to the source
    999 *
    1000 * @param[in] pu1_ref_half_x
    1001 *  UWORD8 pointer to half pel buffer
    1002 *
    1003 * @param[in] pu1_ref_half_y
    1004 *  UWORD8 pointer to half pel buffer
    1005 *
    1006 * @param[in] pu1_ref_half_xy
    1007 *  UWORD8 pointer to half pel buffer
   1008 *
   1009 * @param[in] src_strd
   1010 *  integer source stride
   1011 *
   1012 * @param[in] ref_strd
   1013 *  integer ref stride
   1014 *
   1015 * @param[out] pi4_sad
   1016 *  integer evaluated sad
   1017 *  pi4_sad[0] - half x
   1018 *  pi4_sad[1] - half x - 1
   1019 *  pi4_sad[2] - half y
    1020 *  pi4_sad[3] - half y - strd
   1021 *  pi4_sad[4] - half xy
   1022 *  pi4_sad[5] - half xy - 1
   1023 *  pi4_sad[6] - half xy - strd
   1024 *  pi4_sad[7] - half xy - 1 - strd
   1025 *
   1026 * @remarks
   1027 *
   1028 ******************************************************************************
   1029 */
   1030 void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
   1031                                    UWORD8 *pu1_ref_half_x,
   1032                                    UWORD8 *pu1_ref_half_y,
   1033                                    UWORD8 *pu1_ref_half_xy,
   1034                                    WORD32 src_strd,
   1035                                    WORD32 ref_strd,
   1036                                    WORD32 *pi4_sad)
   1037 {
   1038     UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
   1039     UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
   1040     UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
   1041     UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
   1042     UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
   1043     WORD32 val1, val2;
   1044 
   1045     __m128i src, ref_half_x, ref_half_y, ref_half_xy;
   1046     __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left;
   1047     __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7;
   1048     __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7;
   1049     // Row 0 sad calculation
   1050     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1051     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1052     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1053     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1054     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1055     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1056     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1057     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1058     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1059 
   1060     sad_r0 = _mm_sad_epu8(src, ref_half_x);
   1061     sad_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1062     sad_r2 = _mm_sad_epu8(src, ref_half_y);
   1063     sad_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1064     sad_r4 = _mm_sad_epu8(src, ref_half_xy);
   1065     sad_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1066     sad_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1067     sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1068 
   1069     pu1_src += src_strd;
   1070     pu1_ref_half_x += ref_strd;
   1071     pu1_ref_half_x_left += ref_strd;
   1072     pu1_ref_half_y += ref_strd;
   1073     pu1_ref_half_y_top += ref_strd;
   1074     pu1_ref_half_xy += ref_strd;
   1075     pu1_ref_half_xy_left += ref_strd;
   1076     pu1_ref_half_xy_top += ref_strd;
   1077     pu1_ref_half_xy_top_left += ref_strd;
   1078 
   1079     // Row 1 sad calculation
   1080     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1081     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1082     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1083     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1084     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1085     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1086     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1087     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1088     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1089 
   1090     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1091     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1092     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1093     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1094     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1095     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1096     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1097     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1098 
   1099     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1100     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1101     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1102     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1103     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1104     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1105     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1106     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1107 
   1108     pu1_src += src_strd;
   1109     pu1_ref_half_x += ref_strd;
   1110     pu1_ref_half_x_left += ref_strd;
   1111     pu1_ref_half_y += ref_strd;
   1112     pu1_ref_half_y_top += ref_strd;
   1113     pu1_ref_half_xy += ref_strd;
   1114     pu1_ref_half_xy_left += ref_strd;
   1115     pu1_ref_half_xy_top += ref_strd;
   1116     pu1_ref_half_xy_top_left += ref_strd;
   1117 
   1118     // Row 2 sad calculation
   1119     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1120     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1121     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1122     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1123     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1124     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1125     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1126     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1127     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1128 
   1129     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1130     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1131     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1132     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1133     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1134     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1135     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1136     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1137 
   1138     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1139     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1140     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1141     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1142     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1143     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1144     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1145     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1146 
   1147     pu1_src += src_strd;
   1148     pu1_ref_half_x += ref_strd;
   1149     pu1_ref_half_x_left += ref_strd;
   1150     pu1_ref_half_y += ref_strd;
   1151     pu1_ref_half_y_top += ref_strd;
   1152     pu1_ref_half_xy += ref_strd;
   1153     pu1_ref_half_xy_left += ref_strd;
   1154     pu1_ref_half_xy_top += ref_strd;
   1155     pu1_ref_half_xy_top_left += ref_strd;
   1156 
   1157     // Row 3 sad calculation
   1158     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1159     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1160     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1161     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1162     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1163     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1164     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1165     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1166     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1167 
   1168     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1169     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1170     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1171     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1172     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1173     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1174     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1175     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1176 
   1177     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1178     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1179     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1180     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1181     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1182     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1183     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1184     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1185 
   1186     pu1_src += src_strd;
   1187     pu1_ref_half_x += ref_strd;
   1188     pu1_ref_half_x_left += ref_strd;
   1189     pu1_ref_half_y += ref_strd;
   1190     pu1_ref_half_y_top += ref_strd;
   1191     pu1_ref_half_xy += ref_strd;
   1192     pu1_ref_half_xy_left += ref_strd;
   1193     pu1_ref_half_xy_top += ref_strd;
   1194     pu1_ref_half_xy_top_left += ref_strd;
   1195 
   1196     // Row 4 sad calculation
   1197     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1198     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1199     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1200     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1201     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1202     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1203     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1204     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1205     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1206 
   1207     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1208     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1209     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1210     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1211     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1212     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1213     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1214     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1215 
   1216     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1217     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1218     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1219     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1220     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1221     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1222     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1223     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1224 
   1225     pu1_src += src_strd;
   1226     pu1_ref_half_x += ref_strd;
   1227     pu1_ref_half_x_left += ref_strd;
   1228     pu1_ref_half_y += ref_strd;
   1229     pu1_ref_half_y_top += ref_strd;
   1230     pu1_ref_half_xy += ref_strd;
   1231     pu1_ref_half_xy_left += ref_strd;
   1232     pu1_ref_half_xy_top += ref_strd;
   1233     pu1_ref_half_xy_top_left += ref_strd;
   1234 
   1235 
   1236     // Row 5 sad calculation
   1237     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1238     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1239     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1240     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1241     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1242     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1243     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1244     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1245     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1246 
   1247     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1248     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1249     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1250     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1251     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1252     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1253     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1254     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1255 
   1256     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1257     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1258     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1259     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1260     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1261     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1262     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1263     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1264 
   1265     pu1_src += src_strd;
   1266     pu1_ref_half_x += ref_strd;
   1267     pu1_ref_half_x_left += ref_strd;
   1268     pu1_ref_half_y += ref_strd;
   1269     pu1_ref_half_y_top += ref_strd;
   1270     pu1_ref_half_xy += ref_strd;
   1271     pu1_ref_half_xy_left += ref_strd;
   1272     pu1_ref_half_xy_top += ref_strd;
   1273     pu1_ref_half_xy_top_left += ref_strd;
   1274 
   1275     // Row 6 sad calculation
   1276     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1277     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1278     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1279     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1280     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1281     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1282     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1283     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1284     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1285 
   1286     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1287     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1288     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1289     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1290     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1291     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1292     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1293     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1294 
   1295     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1296     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1297     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1298     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1299     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1300     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1301     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1302     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1303 
   1304     pu1_src += src_strd;
   1305     pu1_ref_half_x += ref_strd;
   1306     pu1_ref_half_x_left += ref_strd;
   1307     pu1_ref_half_y += ref_strd;
   1308     pu1_ref_half_y_top += ref_strd;
   1309     pu1_ref_half_xy += ref_strd;
   1310     pu1_ref_half_xy_left += ref_strd;
   1311     pu1_ref_half_xy_top += ref_strd;
   1312     pu1_ref_half_xy_top_left += ref_strd;
   1313 
   1314     // Row 7 sad calculation
   1315     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1316     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1317     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1318     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1319     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1320     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1321     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1322     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1323     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1324 
   1325     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1326     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1327     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1328     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1329     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1330     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1331     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1332     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1333 
   1334     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1335     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1336     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1337     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1338     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1339     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1340     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1341     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1342 
   1343     pu1_src += src_strd;
   1344     pu1_ref_half_x += ref_strd;
   1345     pu1_ref_half_x_left += ref_strd;
   1346     pu1_ref_half_y += ref_strd;
   1347     pu1_ref_half_y_top += ref_strd;
   1348     pu1_ref_half_xy += ref_strd;
   1349     pu1_ref_half_xy_left += ref_strd;
   1350     pu1_ref_half_xy_top += ref_strd;
   1351     pu1_ref_half_xy_top_left += ref_strd;
   1352 
   1353     // Row 8 sad calculation
   1354     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1355     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1356     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1357     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1358     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1359     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1360     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1361     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1362     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1363 
   1364     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1365     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1366     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1367     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1368     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1369     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1370     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1371     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1372 
   1373     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1374     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1375     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1376     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1377     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1378     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1379     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1380     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1381 
   1382     pu1_src += src_strd;
   1383     pu1_ref_half_x += ref_strd;
   1384     pu1_ref_half_x_left += ref_strd;
   1385     pu1_ref_half_y += ref_strd;
   1386     pu1_ref_half_y_top += ref_strd;
   1387     pu1_ref_half_xy += ref_strd;
   1388     pu1_ref_half_xy_left += ref_strd;
   1389     pu1_ref_half_xy_top += ref_strd;
   1390     pu1_ref_half_xy_top_left += ref_strd;
   1391 
   1392     // Row 9 sad calculation
   1393     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1394     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1395     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1396     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1397     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1398     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1399     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1400     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1401     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1402 
   1403     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1404     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1405     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1406     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1407     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1408     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1409     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1410     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1411 
   1412     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1413     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1414     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1415     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1416     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1417     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1418     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1419     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1420 
   1421     pu1_src += src_strd;
   1422     pu1_ref_half_x += ref_strd;
   1423     pu1_ref_half_x_left += ref_strd;
   1424     pu1_ref_half_y += ref_strd;
   1425     pu1_ref_half_y_top += ref_strd;
   1426     pu1_ref_half_xy += ref_strd;
   1427     pu1_ref_half_xy_left += ref_strd;
   1428     pu1_ref_half_xy_top += ref_strd;
   1429     pu1_ref_half_xy_top_left += ref_strd;
   1430 
   1431     // Row 10 sad calculation
   1432     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1433     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1434     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1435     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1436     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1437     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1438     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1439     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1440     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1441 
   1442     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1443     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1444     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1445     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1446     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1447     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1448     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1449     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1450 
   1451     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1452     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1453     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1454     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1455     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1456     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1457     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1458     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1459 
   1460     pu1_src += src_strd;
   1461     pu1_ref_half_x += ref_strd;
   1462     pu1_ref_half_x_left += ref_strd;
   1463     pu1_ref_half_y += ref_strd;
   1464     pu1_ref_half_y_top += ref_strd;
   1465     pu1_ref_half_xy += ref_strd;
   1466     pu1_ref_half_xy_left += ref_strd;
   1467     pu1_ref_half_xy_top += ref_strd;
   1468     pu1_ref_half_xy_top_left += ref_strd;
   1469 
   1470     // Row 11 sad calculation
   1471     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1472     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1473     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1474     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1475     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1476     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1477     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1478     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1479     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1480 
   1481     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1482     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1483     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1484     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1485     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1486     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1487     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1488     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1489 
   1490     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1491     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1492     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1493     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1494     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1495     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1496     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1497     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1498 
   1499     pu1_src += src_strd;
   1500     pu1_ref_half_x += ref_strd;
   1501     pu1_ref_half_x_left += ref_strd;
   1502     pu1_ref_half_y += ref_strd;
   1503     pu1_ref_half_y_top += ref_strd;
   1504     pu1_ref_half_xy += ref_strd;
   1505     pu1_ref_half_xy_left += ref_strd;
   1506     pu1_ref_half_xy_top += ref_strd;
   1507     pu1_ref_half_xy_top_left += ref_strd;
   1508 
   1509     // Row 12 sad calculation
   1510     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1511     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1512     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1513     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1514     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1515     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1516     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1517     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1518     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1519 
   1520     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1521     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1522     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1523     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1524     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1525     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1526     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1527     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1528 
   1529     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1530     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1531     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1532     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1533     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1534     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1535     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1536     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1537 
   1538     pu1_src += src_strd;
   1539     pu1_ref_half_x += ref_strd;
   1540     pu1_ref_half_x_left += ref_strd;
   1541     pu1_ref_half_y += ref_strd;
   1542     pu1_ref_half_y_top += ref_strd;
   1543     pu1_ref_half_xy += ref_strd;
   1544     pu1_ref_half_xy_left += ref_strd;
   1545     pu1_ref_half_xy_top += ref_strd;
   1546     pu1_ref_half_xy_top_left += ref_strd;
   1547 
   1548     // Row 13 sad calculation
   1549     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1550     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1551     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1552     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1553     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1554     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1555     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1556     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1557     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1558 
   1559     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1560     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1561     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1562     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1563     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1564     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1565     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1566     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1567 
   1568     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1569     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1570     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1571     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1572     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1573     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1574     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1575     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1576 
   1577     pu1_src += src_strd;
   1578     pu1_ref_half_x += ref_strd;
   1579     pu1_ref_half_x_left += ref_strd;
   1580     pu1_ref_half_y += ref_strd;
   1581     pu1_ref_half_y_top += ref_strd;
   1582     pu1_ref_half_xy += ref_strd;
   1583     pu1_ref_half_xy_left += ref_strd;
   1584     pu1_ref_half_xy_top += ref_strd;
   1585     pu1_ref_half_xy_top_left += ref_strd;
   1586 
   1587     // Row 14 sad calculation
   1588     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1589     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1590     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1591     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1592     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1593     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1594     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1595     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1596     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1597 
   1598     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1599     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1600     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1601     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1602     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1603     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1604     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1605     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1606 
   1607     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1608     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1609     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1610     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1611     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1612     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1613     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1614     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1615 
   1616     pu1_src += src_strd;
   1617     pu1_ref_half_x += ref_strd;
   1618     pu1_ref_half_x_left += ref_strd;
   1619     pu1_ref_half_y += ref_strd;
   1620     pu1_ref_half_y_top += ref_strd;
   1621     pu1_ref_half_xy += ref_strd;
   1622     pu1_ref_half_xy_left += ref_strd;
   1623     pu1_ref_half_xy_top += ref_strd;
   1624     pu1_ref_half_xy_top_left += ref_strd;
   1625 
   1626     // Row 15 sad calculation
   1627     src = _mm_loadu_si128((__m128i *) (pu1_src));
   1628     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
   1629     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
   1630     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
   1631     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
   1632     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
   1633     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
   1634     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
   1635     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
   1636 
   1637     res_r0 = _mm_sad_epu8(src, ref_half_x);
   1638     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
   1639     res_r2 = _mm_sad_epu8(src, ref_half_y);
   1640     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
   1641     res_r4 = _mm_sad_epu8(src, ref_half_xy);
   1642     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
   1643     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
   1644     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
   1645 
   1646     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
   1647     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
   1648     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
   1649     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
   1650     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
   1651     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
   1652     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
   1653     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
   1654 
   1655     val1 = _mm_extract_epi32(sad_r0, 0);
   1656     val2 = _mm_extract_epi32(sad_r0, 2);
   1657     pi4_sad[0] = (val1 + val2);
   1658 
   1659     val1 = _mm_extract_epi32(sad_r1, 0);
   1660     val2 = _mm_extract_epi32(sad_r1, 2);
   1661     pi4_sad[1] = (val1 + val2);
   1662 
   1663     val1 = _mm_extract_epi32(sad_r2, 0);
   1664     val2 = _mm_extract_epi32(sad_r2, 2);
   1665     pi4_sad[2] = (val1 + val2);
   1666 
   1667     val1 = _mm_extract_epi32(sad_r3, 0);
   1668     val2 = _mm_extract_epi32(sad_r3, 2);
   1669     pi4_sad[3] = (val1 + val2);
   1670 
   1671     val1 = _mm_extract_epi32(sad_r4, 0);
   1672     val2 = _mm_extract_epi32(sad_r4, 2);
   1673     pi4_sad[4] = (val1 + val2);
   1674 
   1675     val1 = _mm_extract_epi32(sad_r5, 0);
   1676     val2 = _mm_extract_epi32(sad_r5, 2);
   1677     pi4_sad[5] = (val1 + val2);
   1678 
   1679     val1 = _mm_extract_epi32(sad_r6, 0);
   1680     val2 = _mm_extract_epi32(sad_r6, 2);
   1681     pi4_sad[6] = (val1 + val2);
   1682 
   1683     val1 = _mm_extract_epi32(sad_r7, 0);
   1684     val2 = _mm_extract_epi32(sad_r7, 2);
   1685     pi4_sad[7] = (val1 + val2);
   1686 
   1687     return;
   1688 }
   1689 /*
   1690 *
   1691 * @brief This function computes SAD between two 16x16 blocks
   1692 *        It also computes if the block will be zero after H264 transform and quant for
   1693 *        Intra 16x16 blocks
   1694 *
   1695 * @param[in] pu1_src
   1696 *  UWORD8 pointer to the source
   1697 *
* @param[out] pu1_est
*  UWORD8 pointer to the estimated (predicted) block
   1700 *
   1701 * @param[in] src_strd
   1702 *  integer source stride
   1703 *
* @param[in] est_strd
*  integer estimated-block stride
   1706 *
* @param[in] pu2_thrsh
*  Threshold for each element of the transformed and quantized block
   1709 *
   1710 * @param[out] pi4_mb_distortion
   1711 *  integer evaluated sad
   1712 *
* @param[out] pu4_is_zero
*  Pointer to store if the block is zero after transform and quantization
   1715 *
   1716 * @remarks
   1717 *
   1718 ******************************************************************************
   1719 */
   1720 void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
   1721                                          UWORD8 *pu1_est,
   1722                                          WORD32 src_strd,
   1723                                          WORD32 est_strd,
   1724                                          UWORD16 *pu2_thrsh,
   1725                                          WORD32 *pi4_mb_distortion,
   1726                                          UWORD32 *pu4_is_zero)
   1727 {
   1728     __m128i src_r0, src_r1, src_r2, src_r3;
   1729     __m128i est_r0, est_r1, est_r2, est_r3;
   1730     __m128i temp0, temp1, temp2, temp3, temp4;
   1731     __m128i zero = _mm_setzero_si128();          // all bits reset to zero
   1732     __m128i all_one = _mm_set1_epi8(0xFF);
   1733     __m128i sad_b1, sad_b2, threshold;
   1734     WORD16 sad_1, sad_2;
   1735     WORD32 i;
   1736     UWORD32 flag = 0;
   1737     WORD32 test1, test2;
   1738     threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
   1739     (*pi4_mb_distortion) = 0;
   1740 
   1741     for (i=0; i<4; i++)
   1742     {
   1743         src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
   1744         src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
   1745         src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
   1746         src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
   1747 
   1748         src_r0 = _mm_cvtepu8_epi16(src_r0);
   1749         src_r1 = _mm_cvtepu8_epi16(src_r1);
   1750         src_r2 = _mm_cvtepu8_epi16(src_r2);
   1751         src_r3 = _mm_cvtepu8_epi16(src_r3);
   1752 
   1753         est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
   1754         est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
   1755         est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
   1756         est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
   1757 
   1758         est_r0 = _mm_cvtepu8_epi16(est_r0);
   1759         est_r1 = _mm_cvtepu8_epi16(est_r1);
   1760         est_r2 = _mm_cvtepu8_epi16(est_r2);
   1761         est_r3 = _mm_cvtepu8_epi16(est_r3);
   1762 
   1763         src_r0 = _mm_sub_epi16(src_r0, est_r0);
   1764         src_r1 = _mm_sub_epi16(src_r1, est_r1);
   1765         src_r2 = _mm_sub_epi16(src_r2, est_r2);
   1766         src_r3 = _mm_sub_epi16(src_r3, est_r3);
   1767 
   1768         src_r0 = _mm_abs_epi16(src_r0);
   1769         src_r1 = _mm_abs_epi16(src_r1);
   1770         src_r2 = _mm_abs_epi16(src_r2);
   1771         src_r3 = _mm_abs_epi16(src_r3);
   1772 
   1773         src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
   1774         src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
   1775 
   1776         //SAD calculation
   1777         temp0 = _mm_add_epi16(src_r0, src_r1);      //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
   1778         temp0 = _mm_hadd_epi16(temp0, zero);
   1779         temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
   1780 
   1781         sad_1 = _mm_extract_epi16(temp0, 0);
   1782         sad_2 = _mm_extract_epi16(temp0, 1);
   1783 
   1784         (*pi4_mb_distortion) += sad_1 + sad_2;
   1785 
   1786         if (flag == 0) {
   1787             sad_b1 = _mm_set1_epi16((sad_1 << 1));
   1788             sad_b2 = _mm_set1_epi16((sad_2 << 1));
   1789 
   1790             src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
   1791             src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
   1792 
   1793             src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
   1794             src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
   1795 
   1796             src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
   1797             src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
   1798 
   1799             temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
   1800             temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
   1801 
   1802             temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
   1803             temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
   1804 
   1805             temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
   1806             temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
   1807 
   1808             temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
   1809 
   1810             temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
   1811             temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
   1812 
   1813             temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
   1814 
   1815             temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
   1816             temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
   1817 
   1818             temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
   1819             temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
   1820 
   1821             sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
   1822             sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
   1823 
   1824             temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
   1825 
   1826             temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
   1827 
   1828             temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
   1829             temp1 = _mm_xor_si128(temp1, all_one);
   1830 
   1831             test1 = _mm_test_all_zeros(temp0, all_one);
   1832             test2 = _mm_test_all_zeros(temp1, all_one);
   1833 
   1834             if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
   1835                     || pu2_thrsh[8] <= sad_2)
   1836                 flag = 1;
   1837         }
   1838 
   1839         pu1_src += 8;
   1840         pu1_est += 8;
   1841 
   1842         src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
   1843         src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
   1844         src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
   1845         src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
   1846 
   1847         src_r0 = _mm_cvtepu8_epi16(src_r0);
   1848         src_r1 = _mm_cvtepu8_epi16(src_r1);
   1849         src_r2 = _mm_cvtepu8_epi16(src_r2);
   1850         src_r3 = _mm_cvtepu8_epi16(src_r3);
   1851 
   1852         est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
   1853         est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
   1854         est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
   1855         est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
   1856 
   1857         est_r0 = _mm_cvtepu8_epi16(est_r0);
   1858         est_r1 = _mm_cvtepu8_epi16(est_r1);
   1859         est_r2 = _mm_cvtepu8_epi16(est_r2);
   1860         est_r3 = _mm_cvtepu8_epi16(est_r3);
   1861 
   1862         src_r0 = _mm_sub_epi16(src_r0, est_r0);
   1863         src_r1 = _mm_sub_epi16(src_r1, est_r1);
   1864         src_r2 = _mm_sub_epi16(src_r2, est_r2);
   1865         src_r3 = _mm_sub_epi16(src_r3, est_r3);
   1866 
   1867         src_r0 = _mm_abs_epi16(src_r0);
   1868         src_r1 = _mm_abs_epi16(src_r1);
   1869         src_r2 = _mm_abs_epi16(src_r2);
   1870         src_r3 = _mm_abs_epi16(src_r3);
   1871 
   1872         src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
   1873         src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
   1874 
   1875         //SAD calculation
   1876         temp0 = _mm_add_epi16(src_r0, src_r1);
   1877         temp0 = _mm_hadd_epi16(temp0, zero);
   1878         temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
   1879 
   1880         sad_1 = _mm_extract_epi16(temp0, 0);
   1881         sad_2 = _mm_extract_epi16(temp0, 1);
   1882 
   1883         (*pi4_mb_distortion) += sad_1 + sad_2;
   1884 
   1885         if (flag == 0) {
   1886             sad_b1 = _mm_set1_epi16((sad_1 << 1));
   1887             sad_b2 = _mm_set1_epi16((sad_2 << 1));
   1888 
   1889             src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
   1890             src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
   1891 
   1892             src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
   1893             src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
   1894 
   1895             src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
   1896             src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
   1897 
   1898             temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
   1899             temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
   1900 
   1901             temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
   1902             temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
   1903 
   1904             temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
   1905             temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
   1906 
   1907             temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
   1908 
   1909             temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
   1910             temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
   1911 
   1912             temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
   1913 
   1914             temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
   1915             temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
   1916 
   1917             temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
   1918             temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
   1919 
   1920             sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
   1921             sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
   1922 
   1923             temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
   1924 
   1925             temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
   1926 
   1927             temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
   1928             temp1 = _mm_xor_si128(temp1, all_one);
   1929 
   1930             test1 = _mm_test_all_zeros(temp0, all_one);
   1931             test2 = _mm_test_all_zeros(temp1, all_one);
   1932 
   1933             if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
   1934                     || pu2_thrsh[8] <= sad_2)
   1935                 flag = 1;
   1936         }
   1937 
   1938         pu1_src += 4*src_strd - 8;
   1939         pu1_est += 4*est_strd - 8;
   1940     }
   1941 
   1942         *pu4_is_zero = flag;
   1943 }
   1944