Home | History | Annotate | Download | only in arm
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21 *******************************************************************************
     22 * @file
     23 *  ihevce_had_compute_neon.c
     24 *
     25 * @brief
     26 *  Contains intrinsic definitions of functions for computing had
     27 *
     28 * @author
     29 *  Ittiam
     30 *
     31 * @par List of Functions:
     32 *
     33 * @remarks
     34 *  None
     35 *
     36 ********************************************************************************
     37 */
     38 
     39 /*****************************************************************************/
     40 /* File Includes                                                             */
     41 /*****************************************************************************/
     42 /* System include files */
     43 #include <string.h>
     44 #include <assert.h>
     45 #include <arm_neon.h>
     46 
     47 /* User include files */
     48 #include "ihevc_typedefs.h"
     49 #include "itt_video_api.h"
     50 #include "ihevc_cmn_utils_neon.h"
     51 #include "ihevce_had_satd.h"
     52 #include "ihevce_cmn_utils_instr_set_router.h"
     53 
     54 /*****************************************************************************/
     55 /* Globals                                                                   */
     56 /*****************************************************************************/
     57 const int16_t gu2_dc_mask[8] = { 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
     58 
     59 /*****************************************************************************/
     60 /* Function Macros                                                           */
     61 /*****************************************************************************/
/*
 * RESIDUE(k, is_chroma): loads one row of 8 source and 8 prediction pixels,
 * stores the widened 16-bit difference into *r<k>, then advances pu1_src and
 * pu1_pred by one stride.  For chroma the samples are interleaved (Cb/Cr),
 * so vld2_u8 de-interleaves 16 bytes and only plane .val[0] is kept.
 * Expects pu1_src, pu1_pred, src_strd, pred_strd and an int16x8_t *r<k> to
 * be in scope at the expansion site.  Expands to a bare if/else, so do not
 * use it inside an unbraced conditional.
 */
#define RESIDUE(k, is_chroma)                                                                      \
    if(!is_chroma)                                                                                 \
    {                                                                                              \
        const uint8x8_t s##k = vld1_u8(pu1_src);                                                   \
        const uint8x8_t p##k = vld1_u8(pu1_pred);                                                  \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                                       \
        pu1_src += src_strd;                                                                       \
        pu1_pred += pred_strd;                                                                     \
    }                                                                                              \
    else                                                                                           \
    {                                                                                              \
        const uint8x8_t s##k = vld2_u8(pu1_src).val[0];                                            \
        const uint8x8_t p##k = vld2_u8(pu1_pred).val[0];                                           \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                                       \
        pu1_src += src_strd;                                                                       \
        pu1_pred += pred_strd;                                                                     \
    }
     79 
     80 /*****************************************************************************/
     81 /* Function Definitions                                                      */
     82 /*****************************************************************************/
     83 
     84 static INLINE void
     85     hadamard4x4_2_one_pass(int16x8_t *r0, int16x8_t *r1, int16x8_t *r2, int16x8_t *r3)
     86 {
     87     const int16x8_t a0 = vaddq_s16(*r0, *r2);
     88     const int16x8_t a1 = vaddq_s16(*r1, *r3);
     89     const int16x8_t a2 = vsubq_s16(*r0, *r2);
     90     const int16x8_t a3 = vsubq_s16(*r1, *r3);
     91 
     92     *r0 = vaddq_s16(a0, a1);
     93     *r1 = vsubq_s16(a0, a1);
     94     *r2 = vaddq_s16(a2, a3);
     95     *r3 = vsubq_s16(a2, a3);
     96 }
     97 
     98 static INLINE void hadamard4x4_2(
     99     UWORD8 *pu1_src,
    100     WORD32 src_strd,
    101     UWORD8 *pu1_pred,
    102     WORD32 pred_strd,
    103     int16x8_t *r0,
    104     int16x8_t *r1,
    105     int16x8_t *r2,
    106     int16x8_t *r3)
    107 {
    108     // compute error between src and pred
    109     RESIDUE(0, 0);
    110     RESIDUE(1, 0);
    111     RESIDUE(2, 0);
    112     RESIDUE(3, 0);
    113 
    114     // vertical hadamard tx
    115     hadamard4x4_2_one_pass(r0, r1, r2, r3);
    116 
    117     // transpose
    118     transpose_s16_4x4q(r0, r1, r2, r3);
    119 
    120     // horizontal hadamard tx
    121     hadamard4x4_2_one_pass(r0, r1, r2, r3);
    122 }
    123 
    124 static INLINE void hadamard4x4_4(
    125     UWORD8 *pu1_src,
    126     WORD32 src_strd,
    127     UWORD8 *pu1_pred,
    128     WORD32 pred_strd,
    129     int16x8_t *r0,
    130     int16x8_t *r1,
    131     int16x8_t *r2,
    132     int16x8_t *r3,
    133     int16x8_t *r4,
    134     int16x8_t *r5,
    135     int16x8_t *r6,
    136     int16x8_t *r7)
    137 {
    138     // hadamard 4x4_2n
    139     hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r0, r1, r2, r3);
    140 
    141     // hadamard 4x4_2n
    142     pu1_src += (4 * src_strd);
    143     pu1_pred += (4 * pred_strd);
    144     hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r4, r5, r6, r7);
    145 }
    146 
    147 static INLINE WORD32 hadamard_sad4x4_4(int16x8_t *a, WORD32 *pi4_hsad, WORD32 hsad_stride)
    148 {
    149     int16x8_t p[8];
    150     int32x4_t b01, b23;
    151     int64x2_t c01, c23;
    152     int32x2_t d01, d23;
    153 
    154     // satd
    155     p[0] = vabsq_s16(a[0]);
    156     p[1] = vabsq_s16(a[1]);
    157     p[0] = vaddq_s16(p[0], p[1]);
    158     p[2] = vabsq_s16(a[2]);
    159     p[3] = vabsq_s16(a[3]);
    160     p[2] = vaddq_s16(p[2], p[3]);
    161 
    162     p[4] = vabsq_s16(a[4]);
    163     p[5] = vabsq_s16(a[5]);
    164     p[4] = vaddq_s16(p[4], p[5]);
    165     p[6] = vabsq_s16(a[6]);
    166     p[7] = vabsq_s16(a[7]);
    167     p[6] = vaddq_s16(p[6], p[7]);
    168 
    169     p[0] = vaddq_s16(p[0], p[2]);
    170     b01 = vpaddlq_s16(p[0]);
    171     c01 = vpaddlq_s32(b01);
    172     d01 = vrshrn_n_s64(c01, 2);
    173     vst1_s32(pi4_hsad, d01);
    174     pi4_hsad += hsad_stride;
    175 
    176     p[4] = vaddq_s16(p[4], p[6]);
    177     b23 = vpaddlq_s16(p[4]);
    178     c23 = vpaddlq_s32(b23);
    179     d23 = vrshrn_n_s64(c23, 2);
    180     vst1_s32(pi4_hsad, d23);
    181 
    182     d01 = vadd_s32(d01, d23);
    183 
    184     return (WORD32)(vget_lane_s64(vpaddl_s32(d01), 0));
    185 }
    186 
/**
 * Folds the four packed 4x4 Hadamard transforms in a[0..7] (as produced by
 * hadamard4x4_4) into one 8x8 Hadamard IN PLACE and returns the rounded 8x8
 * SATD.  *early_cbf is set to 1 when any |coefficient| exceeds
 * (i4_frm_qstep >> 8); it is never cleared here.  On return a[] holds the
 * 8x8 coefficient rows, so the caller may store them directly (see
 * ihevce_had_8x8_using_4_4x4_r_neon).
 */
static INLINE WORD32 hadamard_sad8x8_using4x4(int16x8_t *a, WORD32 *early_cbf, WORD32 i4_frm_qstep)
{
    int16x8_t p[8];
    /* significance threshold for the early-CBF decision */
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    int32x4_t b;
    int64x2_t c;
    int64_t satd;
    WORD32 i;

    /* Two extra butterfly stages extend the 4x4 transforms to 8x8: one
       across the vector pair (a[i], a[i + 4]), one across vector halves. */
    for(i = 0; i < 4; i++)
    {
        int16x8_t p0 = vaddq_s16(a[i], a[i + 4]);
        int16x8_t p1 = vsubq_s16(a[i], a[i + 4]);

        int16x4_t q0 = vadd_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q1 = vsub_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q2 = vadd_s16(vget_low_s16(p1), vget_high_s16(p1));
        int16x4_t q3 = vsub_s16(vget_low_s16(p1), vget_high_s16(p1));

        a[i] = vcombine_s16(q0, q2);
        a[i + 4] = vcombine_s16(q1, q3);
    }

/* EARLY_EXIT(k): p[k] = |a[k]|; additionally sets *early_cbf when any lane
   of p[k] is greater than the threshold.  The comparison is skipped once
   the flag is already set. */
#define EARLY_EXIT(k)                                                                              \
    {                                                                                              \
        p[k] = vabsq_s16(a[k]);                                                                    \
        if(*early_cbf == 0)                                                                        \
        {                                                                                          \
            uint16x8_t cmp;                                                                        \
            cmp = vcgtq_s16(p[k], threshold);                                                      \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||                        \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))                         \
            {                                                                                      \
                *early_cbf = 1;                                                                    \
            }                                                                                      \
        }                                                                                          \
    }
    // satd
    EARLY_EXIT(0);
    EARLY_EXIT(1);
    p[0] = vaddq_s16(p[0], p[1]);
    EARLY_EXIT(2);
    EARLY_EXIT(3);
    p[2] = vaddq_s16(p[2], p[3]);

    EARLY_EXIT(4);
    EARLY_EXIT(5);
    p[4] = vaddq_s16(p[4], p[5]);
    EARLY_EXIT(6);
    EARLY_EXIT(7);
#undef EARLY_EXIT
    p[6] = vaddq_s16(p[6], p[7]);

    /* tree-reduce the eight |coeff| rows to a 64-bit scalar */
    p[0] = vaddq_s16(p[0], p[2]);
    p[4] = vaddq_s16(p[4], p[6]);
    p[0] = vaddq_s16(p[0], p[4]);
    b = vpaddlq_s16(p[0]);
    c = vpaddlq_s32(b);
    satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);

    /* (sum + 4) >> 3 normalizes the un-scaled 8x8 Hadamard */
    return ((satd + 4) >> 3);
}
    249 
    250 static INLINE void hadamard8x8_one_pass(
    251     int16x8_t *r0,
    252     int16x8_t *r1,
    253     int16x8_t *r2,
    254     int16x8_t *r3,
    255     int16x8_t *r4,
    256     int16x8_t *r5,
    257     int16x8_t *r6,
    258     int16x8_t *r7)
    259 {
    260     const int16x8_t a0 = vaddq_s16(*r0, *r4);
    261     const int16x8_t a4 = vsubq_s16(*r0, *r4);
    262     const int16x8_t a1 = vaddq_s16(*r1, *r5);
    263     const int16x8_t a5 = vsubq_s16(*r1, *r5);
    264     const int16x8_t a2 = vaddq_s16(*r2, *r6);
    265     const int16x8_t a6 = vsubq_s16(*r2, *r6);
    266     const int16x8_t a3 = vaddq_s16(*r3, *r7);
    267     const int16x8_t a7 = vsubq_s16(*r3, *r7);
    268 
    269     const int16x8_t b0 = vaddq_s16(a0, a2);
    270     const int16x8_t b2 = vsubq_s16(a0, a2);
    271     const int16x8_t b1 = vaddq_s16(a1, a3);
    272     const int16x8_t b3 = vsubq_s16(a1, a3);
    273     const int16x8_t b4 = vaddq_s16(a4, a6);
    274     const int16x8_t b6 = vsubq_s16(a4, a6);
    275     const int16x8_t b5 = vaddq_s16(a5, a7);
    276     const int16x8_t b7 = vsubq_s16(a5, a7);
    277 
    278     *r0 = vaddq_s16(b0, b1);
    279     *r1 = vsubq_s16(b0, b1);
    280     *r2 = vaddq_s16(b2, b3);
    281     *r3 = vsubq_s16(b2, b3);
    282     *r4 = vaddq_s16(b4, b5);
    283     *r5 = vsubq_s16(b4, b5);
    284     *r6 = vaddq_s16(b6, b7);
    285     *r7 = vsubq_s16(b6, b7);
    286 }
    287 
    288 static INLINE void hadamard8x8(
    289     UWORD8 *pu1_src,
    290     WORD32 src_strd,
    291     UWORD8 *pu1_pred,
    292     WORD32 pred_strd,
    293     int16x8_t *r0,
    294     int16x8_t *r1,
    295     int16x8_t *r2,
    296     int16x8_t *r3,
    297     int16x8_t *r4,
    298     int16x8_t *r5,
    299     int16x8_t *r6,
    300     int16x8_t *r7,
    301     WORD32 is_chroma)
    302 {
    303     // compute error between src and pred
    304     RESIDUE(0, is_chroma);
    305     RESIDUE(1, is_chroma);
    306     RESIDUE(2, is_chroma);
    307     RESIDUE(3, is_chroma);
    308     RESIDUE(4, is_chroma);
    309     RESIDUE(5, is_chroma);
    310     RESIDUE(6, is_chroma);
    311     RESIDUE(7, is_chroma);
    312 
    313     // vertical hadamard tx
    314     hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);
    315 
    316     // transpose
    317     transpose_s16_8x8(r0, r1, r2, r3, r4, r5, r6, r7);
    318 
    319     // horizontal hadamard tx
    320     hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);
    321 }
    322 
    323 static INLINE UWORD32 ihevce_HAD_8x8_8bit_plane_neon(
    324     UWORD8 *pu1_src,
    325     WORD32 src_strd,
    326     UWORD8 *pu1_pred,
    327     WORD32 pred_strd,
    328     WORD32 is_chroma,
    329     WORD32 ac_only)
    330 {
    331     int16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
    332     int32x4_t b;
    333     int64x2_t c;
    334     int64_t satd;
    335 
    336     // hadamard 8x8
    337     hadamard8x8(
    338         pu1_src, src_strd, pu1_pred, pred_strd, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, is_chroma);
    339 
    340     if(ac_only)
    341     {
    342         const int16x8_t mask = vld1q_s16(gu2_dc_mask);
    343         a0 = vandq_s16(a0, mask);
    344     }
    345 
    346     // satd
    347     a0 = vabsq_s16(a0);
    348     a1 = vabsq_s16(a1);
    349     a0 = vaddq_s16(a0, a1);
    350     a2 = vabsq_s16(a2);
    351     a3 = vabsq_s16(a3);
    352     a2 = vaddq_s16(a2, a3);
    353 
    354     a4 = vabsq_s16(a4);
    355     a5 = vabsq_s16(a5);
    356     a4 = vaddq_s16(a4, a5);
    357     a6 = vabsq_s16(a6);
    358     a7 = vabsq_s16(a7);
    359     a6 = vaddq_s16(a6, a7);
    360 
    361     a0 = vaddq_s16(a0, a2);
    362     a4 = vaddq_s16(a4, a6);
    363     a0 = vaddq_s16(a0, a4);
    364     b = vpaddlq_s16(a0);
    365     c = vpaddlq_s32(b);
    366     satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);
    367 
    368     return ((satd + 4) >> 3);
    369 }
    370 
/**
 * 4x4 Hadamard SATD of (src - pred) for one plane.
 *
 * is_chroma selects de-interleaving loads (interleaved Cb/Cr input, one
 * plane kept); ac_only masks out the DC term before summing.
 * Returns (sum|coeff| + 2) >> 2, normalizing the un-scaled 4x4 Hadamard.
 */
static INLINE UWORD32 ihevce_HAD_4x4_8bit_plane_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD32 is_chroma,
    WORD32 ac_only)
{
    uint8x16_t src_u8, pred_u8;
    int16x8_t res_01, res_23;
    int16x4_t h[4];
    int16x4_t v[4];
    int16x4x2_t trans_4[2];
    int16x8_t combined_rows[4];
    int32x4x2_t trans_8;
    int32x4_t sad_32_4[3];
    int32x2_t sad_32_2;
    int64x1_t sad_64_1;
    int32_t sad;

    /* gather all four 4x4 rows of src and pred into one q register each */
    if(!is_chroma)
    {
        src_u8 = load_unaligned_u8q(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
    }
    else
    {
        /* de-interleaving variant for packed Cb/Cr input */
        src_u8 = load_unaligned_u8qi(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8qi(pu1_pred, pred_strd);
    }
    /* 16-bit residue: res_01 = rows 0,1 and res_23 = rows 2,3 */
    res_01 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
    res_23 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));

    /* vertical butterflies pairing rows (0,3) and (1,2); the permuted
       ordering relative to the textbook transform only permutes output
       rows, which is harmless for a |coeff| sum */
    h[0] = vadd_s16(vget_low_s16(res_01), vget_high_s16(res_23));
    h[1] = vadd_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[2] = vsub_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[3] = vsub_s16(vget_low_s16(res_01), vget_high_s16(res_23));

    v[0] = vadd_s16(h[0], h[1]);
    v[1] = vadd_s16(h[3], h[2]);
    v[2] = vsub_s16(h[0], h[1]);
    v[3] = vsub_s16(h[3], h[2]);

    /* transpose so the horizontal pass can run lane-wise */
    trans_4[0] = vtrn_s16(v[0], v[2]);
    trans_4[1] = vtrn_s16(v[1], v[3]);

    combined_rows[0] = vcombine_s16(trans_4[0].val[0], trans_4[1].val[0]);
    combined_rows[1] = vcombine_s16(trans_4[0].val[1], trans_4[1].val[1]);

    /* horizontal butterflies, stage 1 */
    combined_rows[2] = vaddq_s16(combined_rows[0], combined_rows[1]);
    combined_rows[3] = vsubq_s16(combined_rows[0], combined_rows[1]);

    /* finish the transpose at 32-bit granularity */
    trans_8 =
        vtrnq_s32(vreinterpretq_s32_s16(combined_rows[2]), vreinterpretq_s32_s16(combined_rows[3]));

    /* horizontal butterflies, stage 2, straight into absolute values */
    combined_rows[0] =
        vaddq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[0] = vabsq_s16(combined_rows[0]);
    combined_rows[1] =
        vsubq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[1] = vabsq_s16(combined_rows[1]);

    if(ac_only)
    {
        /* clear lane 0 of combined_rows[0] — holds the DC term (assumed
           by gu2_dc_mask's layout) */
        const int16x8_t mask = vld1q_s16(gu2_dc_mask);
        combined_rows[0] = vandq_s16(combined_rows[0], mask);
    }

    /* fold the sixteen |coefficients| down to one scalar */
    sad_32_4[0] = vpaddlq_s16(combined_rows[0]);
    sad_32_4[1] = vpaddlq_s16(combined_rows[1]);
    sad_32_4[2] = vaddq_s32(sad_32_4[0], sad_32_4[1]);
    sad_32_2 = vadd_s32(vget_high_s32(sad_32_4[2]), vget_low_s32(sad_32_4[2]));
    sad_64_1 = vpaddl_s32(sad_32_2);
    sad = vget_lane_s64(sad_64_1, 0);

    /* (sum + 2) >> 2 normalizes the un-scaled 4x4 Hadamard */
    return ((sad + 2) >> 2);
}
    448 
    449 UWORD32 ihevce_HAD_4x4_8bit_neon(
    450     UWORD8 *pu1_src,
    451     WORD32 src_strd,
    452     UWORD8 *pu1_pred,
    453     WORD32 pred_strd,
    454     WORD16 *pi2_dst,
    455     WORD32 dst_strd)
    456 {
    457     (void)pi2_dst;
    458     (void)dst_strd;
    459     return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
    460 }
    461 
    462 UWORD32 ihevce_chroma_compute_AC_HAD_4x4_8bit_neon(
    463     UWORD8 *pu1_origin,
    464     WORD32 src_strd,
    465     UWORD8 *pu1_pred_buf,
    466     WORD32 pred_strd,
    467     WORD16 *pi2_dst,
    468     WORD32 dst_strd)
    469 {
    470     (void)pi2_dst;
    471     (void)dst_strd;
    472     return ihevce_HAD_4x4_8bit_plane_neon(pu1_origin, src_strd, pu1_pred_buf, pred_strd, 1, 1);
    473 }
    474 
    475 UWORD32 ihevce_HAD_8x8_8bit_neon(
    476     UWORD8 *pu1_src,
    477     WORD32 src_strd,
    478     UWORD8 *pu1_pred,
    479     WORD32 pred_strd,
    480     WORD16 *pi2_dst,
    481     WORD32 dst_strd)
    482 {
    483     (void)pi2_dst;
    484     (void)dst_strd;
    485     return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
    486 }
    487 
    488 UWORD32 ihevce_compute_ac_had_8x8_8bit_neon(
    489     UWORD8 *pu1_src,
    490     WORD32 src_strd,
    491     UWORD8 *pu1_pred,
    492     WORD32 pred_strd,
    493     WORD16 *pi2_dst,
    494     WORD32 dst_strd)
    495 {
    496     (void)pi2_dst;
    497     (void)dst_strd;
    498     return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 1);
    499 }
    500 
    501 UWORD32 ihevce_HAD_16x16_8bit_neon(
    502     UWORD8 *pu1_src,
    503     WORD32 src_strd,
    504     UWORD8 *pu1_pred,
    505     WORD32 pred_strd,
    506     WORD16 *pi2_dst,
    507     WORD32 dst_strd)
    508 {
    509     int16x8_t b0[8];
    510     int16x8_t b1[8];
    511     int16x8_t b2[8];
    512     int16x8_t b3[8];
    513     uint32x4_t sum = vdupq_n_u32(0);
    514     uint64x2_t c;
    515     uint64_t satd;
    516     WORD32 i;
    517 
    518     (void)pi2_dst;
    519     (void)dst_strd;
    520 
    521     // hadamard 8x8 - b0
    522     hadamard8x8(
    523         pu1_src,
    524         src_strd,
    525         pu1_pred,
    526         pred_strd,
    527         &b0[0],
    528         &b0[1],
    529         &b0[2],
    530         &b0[3],
    531         &b0[4],
    532         &b0[5],
    533         &b0[6],
    534         &b0[7],
    535         0);
    536     // hadamard 8x8 - b1
    537     hadamard8x8(
    538         pu1_src + 8,
    539         src_strd,
    540         pu1_pred + 8,
    541         pred_strd,
    542         &b1[0],
    543         &b1[1],
    544         &b1[2],
    545         &b1[3],
    546         &b1[4],
    547         &b1[5],
    548         &b1[6],
    549         &b1[7],
    550         0);
    551     // hadamard 8x8 - b2
    552     hadamard8x8(
    553         pu1_src + (8 * src_strd),
    554         src_strd,
    555         pu1_pred + (8 * pred_strd),
    556         pred_strd,
    557         &b2[0],
    558         &b2[1],
    559         &b2[2],
    560         &b2[3],
    561         &b2[4],
    562         &b2[5],
    563         &b2[6],
    564         &b2[7],
    565         0);
    566     // hadamard 8x8 - b3
    567     hadamard8x8(
    568         pu1_src + (8 * src_strd) + 8,
    569         src_strd,
    570         pu1_pred + (8 * pred_strd) + 8,
    571         pred_strd,
    572         &b3[0],
    573         &b3[1],
    574         &b3[2],
    575         &b3[3],
    576         &b3[4],
    577         &b3[5],
    578         &b3[6],
    579         &b3[7],
    580         0);
    581 
    582     for(i = 0; i < 8; i++)
    583     {
    584         int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
    585         int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
    586         int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
    587         int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);
    588 
    589         int16x8_t q0 = vaddq_s16(p0, p2);
    590         int16x8_t q1 = vsubq_s16(p0, p2);
    591         int16x8_t q2 = vaddq_s16(p1, p3);
    592         int16x8_t q3 = vsubq_s16(p1, p3);
    593 
    594         uint16x8_t r0 =
    595             vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
    596         uint16x8_t r1 =
    597             vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));
    598 
    599         uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
    600         uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));
    601 
    602         sum = vaddq_u32(sum, s0);
    603         sum = vaddq_u32(sum, s1);
    604     }
    605 
    606     c = vpaddlq_u32(sum);
    607     satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);
    608 
    609     return ((satd + 4) >> 3);
    610 }
    611 
    612 UWORD32 ihevce_chroma_HAD_4x4_8bit_neon(
    613     UWORD8 *pu1_src,
    614     WORD32 src_strd,
    615     UWORD8 *pu1_pred,
    616     WORD32 pred_strd,
    617     WORD16 *pi2_dst,
    618     WORD32 dst_strd)
    619 {
    620     (void)pi2_dst;
    621     (void)dst_strd;
    622     return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
    623 }
    624 
    625 UWORD32 ihevce_chroma_HAD_8x8_8bit_neon(
    626     UWORD8 *pu1_src,
    627     WORD32 src_strd,
    628     UWORD8 *pu1_pred,
    629     WORD32 pred_strd,
    630     WORD16 *pi2_dst,
    631     WORD32 dst_strd)
    632 {
    633     (void)pi2_dst;
    634     (void)dst_strd;
    635     return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
    636 }
    637 
    638 UWORD32 ihevce_chroma_HAD_16x16_8bit_neon(
    639     UWORD8 *pu1_src,
    640     WORD32 src_strd,
    641     UWORD8 *pu1_pred,
    642     WORD32 pred_strd,
    643     WORD16 *pi2_dst,
    644     WORD32 dst_strd)
    645 {
    646     UWORD32 au4_satd[4];
    647 
    648     (void)pi2_dst;
    649     (void)dst_strd;
    650     au4_satd[0] = ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
    651     au4_satd[1] =
    652         ihevce_HAD_8x8_8bit_plane_neon(pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, 1, 0);
    653     au4_satd[2] = ihevce_HAD_8x8_8bit_plane_neon(
    654         pu1_src + 8 * src_strd, src_strd, pu1_pred + 8 * pred_strd, pred_strd, 1, 0);
    655     au4_satd[3] = ihevce_HAD_8x8_8bit_plane_neon(
    656         pu1_src + 8 * src_strd + 16, src_strd, pu1_pred + 8 * pred_strd + 16, pred_strd, 1, 0);
    657 
    658     return au4_satd[0] + au4_satd[1] + au4_satd[2] + au4_satd[3];
    659 }
    660 
/**
 * Luma 32x32 SATD: sixteen 8x8 Hadamard transforms are extended to four
 * 16x16 transforms and then to 32x32; only |coefficient| sums are produced
 * (the transform itself is not written out, so pi2_dst / dst_strd are
 * unused).  Per-16x16 results are scaled down by 4 (truncating shift) so
 * the 32x32 stage still fits in 16-bit lanes.
 *
 * @return (sum|coeff| + 2) >> 2
 */
UWORD32 ihevce_HAD_32x32_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    /* a[b16][b8][row]: transform rows of 8x8 block b8 inside 16x16 block b16 */
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    WORD32 b8, b16;
    uint64x2_t c;
    uint64_t satd;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;
    // hadamard 32x32
    for(b16 = 0; b16 < 4; b16++)
    {
        /* b16 = 0..3 -> TL, TR, BL, BR 16x16 quadrant */
        UWORD8 *pu1_src_b16 = pu1_src + (b16 >> 1) * (src_strd * 16) + ((b16 & 1) * 16);
        UWORD8 *pu1_pred_b16 = pu1_pred + (b16 >> 1) * (pred_strd * 16) + ((b16 & 1) * 16);
        // hadamard 16x16
        for(b8 = 0; b8 < 4; b8++)
        {
            /* b8 = 0..3 -> TL, TR, BL, BR 8x8 inside the quadrant */
            UWORD8 *pu1_src_b8 = pu1_src_b16 + (b8 >> 1) * (src_strd * 8) + ((b8 & 1) * 8);
            UWORD8 *pu1_pred_b8 = pu1_pred_b16 + (b8 >> 1) * (pred_strd * 8) + ((b8 & 1) * 8);
            // hadamard 8x8
            hadamard8x8(
                pu1_src_b8,
                src_strd,
                pu1_pred_b8,
                pred_strd,
                &a[b16][b8][0],
                &a[b16][b8][1],
                &a[b16][b8][2],
                &a[b16][b8][3],
                &a[b16][b8][4],
                &a[b16][b8][5],
                &a[b16][b8][6],
                &a[b16][b8][7],
                0);
        }
        /* extend the four 8x8s to one 16x16, row by row; vhaddq/vhsubq
           halve the first stage to stay inside 16 bits */
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vhaddq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p1 = vhsubq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p2 = vhaddq_s16(a[b16][2][i], a[b16][3][i]);
            int16x8_t p3 = vhsubq_s16(a[b16][2][i], a[b16][3][i]);

            a[b16][0][i] = vaddq_s16(p0, p2);
            a[b16][1][i] = vsubq_s16(p0, p2);
            a[b16][2][i] = vaddq_s16(p1, p3);
            a[b16][3][i] = vsubq_s16(p1, p3);

            /* scale by 1/4 (truncating) so the 32x32 stage cannot overflow */
            a[b16][0][i] = vshrq_n_s16(a[b16][0][i], 2);
            a[b16][1][i] = vshrq_n_s16(a[b16][1][i], 2);
            a[b16][2][i] = vshrq_n_s16(a[b16][2][i], 2);
            a[b16][3][i] = vshrq_n_s16(a[b16][3][i], 2);
        }
    }
    /* combine the four 16x16 quadrants into the 32x32 transform and
       accumulate |coefficients| */
    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            uint16x8_t r0 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
            uint16x8_t r1 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));

            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    /* rounded normalization; NOTE(review): coefficients were pre-scaled by
       >>2 above, so the effective scaling differs from the 8x8/16x16 paths */
    return ((satd + 2) >> 2);
}
    753 
    754 WORD32 ihevce_had4_4x4_neon(
    755     UWORD8 *pu1_src,
    756     WORD32 src_strd,
    757     UWORD8 *pu1_pred,
    758     WORD32 pred_strd,
    759     WORD16 *pi2_dst4x4,
    760     WORD32 dst_strd,
    761     WORD32 *pi4_hsad,
    762     WORD32 hsad_stride,
    763     WORD32 i4_frm_qstep)
    764 {
    765     int16x8_t a[8];
    766 
    767     (void)pi2_dst4x4;
    768     (void)dst_strd;
    769     (void)i4_frm_qstep;
    770 
    771     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
    772     hadamard4x4_4(
    773         pu1_src,
    774         src_strd,
    775         pu1_pred,
    776         pred_strd,
    777         &a[0],
    778         &a[1],
    779         &a[2],
    780         &a[3],
    781         &a[4],
    782         &a[5],
    783         &a[6],
    784         &a[7]);
    785 
    786     return hadamard_sad4x4_4(a, pi4_hsad, hsad_stride);
    787 }
    788 
    789 WORD32 ihevce_had_8x8_using_4_4x4_r_neon(
    790     UWORD8 *pu1_src,
    791     WORD32 src_strd,
    792     UWORD8 *pu1_pred,
    793     WORD32 pred_strd,
    794     WORD16 *pi2_dst,
    795     WORD32 dst_strd,
    796     WORD32 **ppi4_hsad,
    797     WORD32 **ppi4_tu_split,
    798     WORD32 **ppi4_tu_early_cbf,
    799     WORD32 pos_x_y_4x4,
    800     WORD32 num_4x4_in_row,
    801     WORD32 lambda,
    802     WORD32 lambda_q_shift,
    803     WORD32 i4_frm_qstep,
    804     WORD32 i4_cur_depth,
    805     WORD32 i4_max_depth,
    806     WORD32 i4_max_tr_size,
    807     WORD32 *pi4_tu_split_cost,
    808     void *pv_func_sel)
    809 {
    810     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    811     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
    812 
    813     WORD32 *pi4_4x4_hsad;
    814     WORD32 *pi4_8x8_hsad;
    815     WORD32 *pi4_8x8_tu_split;
    816     WORD32 *pi4_8x8_tu_early_cbf;
    817 
    818     WORD32 cost_child, cost_parent;
    819     WORD32 best_cost;
    820     WORD32 early_cbf = 0;
    821     const UWORD8 u1_cur_tr_size = 8;
    822 
    823     WORD32 i;
    824 
    825     int16x8_t a[8];
    826 
    827     (void)pv_func_sel;
    828 
    829     assert(pos_x >= 0);
    830     assert(pos_y >= 0);
    831 
    832     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
    833     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
    834     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    835     pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    836     pi4_8x8_tu_early_cbf =
    837         ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    838 
    839     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
    840     hadamard4x4_4(
    841         pu1_src,
    842         src_strd,
    843         pu1_pred,
    844         pred_strd,
    845         &a[0],
    846         &a[1],
    847         &a[2],
    848         &a[3],
    849         &a[4],
    850         &a[5],
    851         &a[6],
    852         &a[7]);
    853 
    854     /* -------- cost child -------- */
    855     cost_child = hadamard_sad4x4_4(a, pi4_4x4_hsad, num_4x4_in_row);
    856     /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
    857     cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
    858 
    859     /* -------- cost parent -------- */
    860     cost_parent = hadamard_sad8x8_using4x4(a, &early_cbf, i4_frm_qstep);
    861     for(i = 0; i < 8; i++, pi2_dst += dst_strd)
    862         vst1q_s16(pi2_dst, a[i]);
    863 
    864     if(i4_cur_depth < i4_max_depth)
    865     {
    866         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
    867         {
    868             *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
    869             best_cost = cost_child;
    870             best_cost <<= 1;
    871             best_cost++;
    872             pi4_8x8_tu_split[0] = 1;
    873             pi4_8x8_hsad[0] = cost_child;
    874         }
    875         else
    876         {
    877             best_cost = cost_parent;
    878             best_cost <<= 1;
    879             pi4_8x8_tu_split[0] = 0;
    880             pi4_8x8_hsad[0] = cost_parent;
    881         }
    882     }
    883     else
    884     {
    885         best_cost = cost_parent;
    886         best_cost <<= 1;
    887         pi4_8x8_tu_split[0] = 0;
    888         pi4_8x8_hsad[0] = cost_parent;
    889     }
    890 
    891     pi4_8x8_tu_early_cbf[0] = early_cbf;
    892 
    893     /* best cost has tu_split_flag at LSB(Least significant bit) */
    894     return ((best_cost << 1) + early_cbf);
    895 }
    896 
/**
*******************************************************************************
*
* @brief
*  Builds the 16x16 Hadamard transform from four pre-computed 8x8 Hadamard
*  blocks and returns the 16x16 SATD.
*
* @par Description:
*  The four 8x8 coefficient blocks are loaded from pi2_8x8_had (top-left,
*  top-right, bottom-left, bottom-right). One extra butterfly stage across
*  the four blocks produces the 16x16 coefficients, which are stored to the
*  corresponding quadrants of pi2_dst. The SATD is the sum of the absolute
*  coefficients, with a rounding add of 4 followed by >> 3.
*
* @param[in] pi2_8x8_had : base of the four 8x8 Hadamard coefficient blocks
* @param[in] had8_strd : row stride of the 8x8 coefficient buffer
* @param[out] pi2_dst : destination for the 16x16 Hadamard coefficients
* @param[in] dst_strd : destination row stride
* @param[in] i4_frm_qstep : frame qstep; (i4_frm_qstep >> 8) is the per-
*  coefficient threshold for the early-CBF decision
* @param[in,out] pi4_cbf : set to 1 when any |coefficient| exceeds the
*  threshold; once non-zero, no further comparisons are made
*
* @returns 16x16 SATD
*
*******************************************************************************
*/
static WORD32 ihevce_compute_16x16HAD_using_8x8_neon(
    WORD16 *pi2_8x8_had,
    WORD32 had8_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    int16x8_t b0[8]; /* top-left 8x8 block */
    int16x8_t b1[8]; /* top-right 8x8 block */
    int16x8_t b2[8]; /* bottom-left 8x8 block */
    int16x8_t b3[8]; /* bottom-right 8x8 block */
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    uint32x4_t sum = vdupq_n_u32(0);
    uint64x2_t c;
    uint64_t satd;
    WORD32 i;

    /* Load the two top 8x8 blocks (first 8 rows of the 16x16 region) */
    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b0[i] = vld1q_s16(pi2_8x8_had);
        b1[i] = vld1q_s16(pi2_8x8_had + 8);
    }
    /* Load the two bottom 8x8 blocks (last 8 rows) */
    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b2[i] = vld1q_s16(pi2_8x8_had);
        b3[i] = vld1q_s16(pi2_8x8_had + 8);
    }

/* Replaces p<k> with |q<k>| and, while *pi4_cbf is still 0, sets it to 1 if
   any |coefficient| in q<k> exceeds the threshold. Note the macro captures
   `threshold` and `pi4_cbf` from the enclosing scope; it is also reused by
   ihevce_compute_32x32HAD_using_16x16_neon later in this file. */
#define EARLY_EXIT(k)                                                                              \
    {                                                                                              \
        p##k = vabsq_s16(q##k);                                                                    \
        if(*pi4_cbf == 0)                                                                          \
        {                                                                                          \
            uint16x8_t cmp;                                                                        \
            cmp = vcgtq_s16(p##k, threshold);                                                      \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||                        \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))                         \
            {                                                                                      \
                *pi4_cbf = 1;                                                                      \
            }                                                                                      \
        }                                                                                          \
    }
    /* Final butterfly stage combining the four 8x8 transforms, processing
       one row of each block per iteration */
    for(i = 0; i < 8; i++, pi2_dst += dst_strd)
    {
        /* Halving add/sub keeps the intermediate values within 16 bits */
        int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
        int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
        int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
        int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);

        int16x8_t q0 = vaddq_s16(p0, p2);
        int16x8_t q1 = vsubq_s16(p0, p2);
        int16x8_t q2 = vaddq_s16(p1, p3);
        int16x8_t q3 = vsubq_s16(p1, p3);

        /* Store one row of each destination quadrant; each EARLY_EXIT also
           overwrites p<k> with |q<k>| for the SATD accumulation below */
        vst1q_s16(pi2_dst, q0);
        EARLY_EXIT(0);
        vst1q_s16(pi2_dst + 8, q1);
        EARLY_EXIT(1);
        vst1q_s16(pi2_dst + 8 * dst_strd, q2);
        EARLY_EXIT(2);
        vst1q_s16(pi2_dst + 8 * dst_strd + 8, q3);
        EARLY_EXIT(3);
        /* p0..p3 are absolute values here, so reinterpreting the signed
           vectors as unsigned and adding is safe */
        uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
        uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

        /* Widen to 32 bits before accumulating across rows */
        uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
        uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

        sum = vaddq_u32(sum, s0);
        sum = vaddq_u32(sum, s1);
    }

    /* Horizontal reduction of the four 32-bit partial sums */
    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    /* Rounding add of 4 then >> 3 — presumably compensates the scale lost
       in the halving butterflies; TODO(review) confirm against C reference */
    return ((satd + 4) >> 3);
}
    975 
    976 WORD32 ihevce_had_16x16_r_neon(
    977     UWORD8 *pu1_src,
    978     WORD32 src_strd,
    979     UWORD8 *pu1_pred,
    980     WORD32 pred_strd,
    981     WORD16 *pi2_dst,
    982     WORD32 dst_strd,
    983     WORD32 **ppi4_hsad,
    984     WORD32 **ppi4_tu_split,
    985     WORD32 **ppi4_tu_early_cbf,
    986     WORD32 pos_x_y_4x4,
    987     WORD32 num_4x4_in_row,
    988     WORD32 lambda,
    989     WORD32 lambda_q_shift,
    990     WORD32 i4_frm_qstep,
    991     WORD32 i4_cur_depth,
    992     WORD32 i4_max_depth,
    993     WORD32 i4_max_tr_size,
    994     WORD32 *pi4_tu_split_cost,
    995     void *pv_func_sel)
    996 {
    997     WORD16 ai2_8x8_had[256];
    998 
    999     WORD32 *pi4_16x16_hsad;
   1000     WORD32 *pi4_16x16_tu_split;
   1001     WORD32 *pi4_16x16_tu_early_cbf;
   1002 
   1003     WORD32 best_cost, best_cost_tu_split;
   1004     WORD32 tu_split_flag = 0;
   1005     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
   1006     WORD32 cost_parent, cost_child = 0;
   1007 
   1008     const UWORD8 u1_cur_tr_size = 16;
   1009 
   1010     WORD32 i;
   1011 
   1012     WORD16 *pi2_y0;
   1013     UWORD8 *src, *pred;
   1014     WORD32 pos_x_y_4x4_0;
   1015 
   1016     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
   1017     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
   1018 
   1019     assert(pos_x >= 0);
   1020     assert(pos_y >= 0);
   1021 
   1022     /* Initialize pointers to  store 16x16 SATDs */
   1023     pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1024 
   1025     pi4_16x16_tu_split =
   1026         ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1027 
   1028     pi4_16x16_tu_early_cbf =
   1029         ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1030 
   1031     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
   1032     for(i = 0; i < 4; i++)
   1033     {
   1034         src = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
   1035         pred = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
   1036         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
   1037         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
   1038 
   1039         best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r_neon(
   1040             src,
   1041             src_strd,
   1042             pred,
   1043             pred_strd,
   1044             pi2_y0,
   1045             16,
   1046             ppi4_hsad,
   1047             ppi4_tu_split,
   1048             ppi4_tu_early_cbf,
   1049             pos_x_y_4x4_0,
   1050             num_4x4_in_row,
   1051             lambda,
   1052             lambda_q_shift,
   1053             i4_frm_qstep,
   1054             i4_cur_depth + 1,
   1055             i4_max_depth,
   1056             i4_max_tr_size,
   1057             pi4_tu_split_cost,
   1058             pv_func_sel);
   1059 
   1060         /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
   1061         best_cost = (best_cost_tu_split >> 2);
   1062 
   1063         /* Last but one bit stores the information regarding the TU_Split */
   1064         tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
   1065 
   1066         /* Last bit stores the information regarding the early_cbf */
   1067         i4_early_cbf_flag += (best_cost_tu_split & 0x1);
   1068 
   1069         cost_child += best_cost;
   1070 
   1071         tu_split_flag <<= 1;
   1072         i4_early_cbf_flag <<= 1;
   1073     }
   1074 
   1075     /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
   1076     pi2_y0 = ai2_8x8_had;
   1077 
   1078     /* Threshold currently passed as "0" */
   1079     cost_parent = ihevce_compute_16x16HAD_using_8x8_neon(
   1080         pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
   1081 
   1082     /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
   1083     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1084 
   1085     i4_early_cbf_flag += early_cbf;
   1086 
   1087     /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
   1088     which decides the extent to which TU_REC needs to be done */
   1089     if(i4_cur_depth < i4_max_depth)
   1090     {
   1091         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
   1092         {
   1093             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1094             tu_split_flag += 1;
   1095             best_cost = cost_child;
   1096         }
   1097         else
   1098         {
   1099             tu_split_flag += 0;
   1100             best_cost = cost_parent;
   1101         }
   1102     }
   1103     else
   1104     {
   1105         tu_split_flag += 0;
   1106         best_cost = cost_parent;
   1107     }
   1108 
   1109     pi4_16x16_hsad[0] = best_cost;
   1110     pi4_16x16_tu_split[0] = tu_split_flag;
   1111     pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
   1112 
   1113     /*returning two values(best cost & tu_split_flag) as a single value*/
   1114     return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
   1115 }
   1116 
/**
*******************************************************************************
*
* @brief
*  Computes the 32x32 SATD from four pre-computed 16x16 Hadamard transform
*  outputs.
*
* @par Description:
*  The four 16x16 coefficient blocks are loaded from pi2_16x16_had as
*  sixteen 8x8 sub-blocks, pre-scaled by an arithmetic >> 2 (discarding the
*  two LSBs, presumably to keep the following butterfly stages within
*  16-bit range — TODO confirm against the C reference). One butterfly
*  stage across the four 16x16 blocks is applied per 8x8 sub-position; the
*  resulting coefficients are only used for the SATD and early-CBF
*  decision, not stored (pi2_dst/dst_strd are unused).
*
* @param[in] pi2_16x16_had : base of the four 16x16 Hadamard blocks
* @param[in] had16_strd : row stride of the 16x16 coefficient buffer
* @param[out] pi2_dst : unused
* @param[in] dst_strd : unused
* @param[in] i4_frm_qstep : frame qstep; (i4_frm_qstep >> 8) is the per-
*  coefficient threshold for the early-CBF decision
* @param[in,out] pi4_cbf : set to 1 when any |coefficient| exceeds the
*  threshold (via the EARLY_EXIT() macro defined earlier in this file)
*
* @returns accumulated absolute-coefficient sum (no rounding/normalisation
*  shift here, unlike the 16x16 variant); the 64-bit lane is implicitly
*  truncated to UWORD32 on return
*
*******************************************************************************
*/
UWORD32 ihevce_compute_32x32HAD_using_16x16_neon(
    WORD16 *pi2_16x16_had,
    WORD32 had16_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    /* a[b16][b8][row]: row vectors of each 8x8 sub-block (b8) of each
       16x16 block (b16) */
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    WORD32 b8, b16;
    uint64x2_t c;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;

    /* Load all sixteen 8x8 sub-blocks, pre-scaled by >> 2 */
    for(b16 = 0; b16 < 4; b16++)
    {
        WORD16 *pi2_b16 = pi2_16x16_had + (b16 >> 1) * (had16_strd * 16) + ((b16 & 1) * 16);

        for(b8 = 0; b8 < 4; b8++)
        {
            WORD16 *pi2_b8 = pi2_b16 + (b8 >> 1) * (had16_strd * 8) + ((b8 & 1) * 8);

            for(i = 0; i < 8; i++, pi2_b8 += had16_strd)
            {
                a[b16][b8][i] = vld1q_s16(pi2_b8);
                a[b16][b8][i] = vshrq_n_s16(a[b16][b8][i], 2);
            }
        }
    }

    /* Butterfly across the four 16x16 blocks for each 8x8 sub-position j,
       one row per iteration of i */
    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            /* Each EARLY_EXIT overwrites p<k> with |q<k>| and updates
               *pi4_cbf if any |coefficient| exceeds the threshold */
            EARLY_EXIT(0);
            EARLY_EXIT(1);
            EARLY_EXIT(2);
            EARLY_EXIT(3);

            /* p0..p3 are absolute values here, so reinterpreting the
               signed vectors as unsigned and adding is safe */
            uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
            uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

            /* Widen to 32 bits before accumulating across rows */
            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    /* Horizontal reduction of the four 32-bit partial sums */
    c = vpaddlq_u32(sum);

    return vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);
}
   1184