Home | History | Annotate | Download | only in arm
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21 ******************************************************************************
     22 * @file
     23 *  ihevce_subpel_neon.c
     24 *
     25 * @brief
     26 *  Subpel refinement modules for ME algo
     27 *
     28 * @author
     29 *  Ittiam
     30 *
     31 * @par List of Functions:
     32 *
     33 * @remarks
     34 *  None
     35 *
     36 ********************************************************************************
     37 */
     38 
     39 /*****************************************************************************/
     40 /* File Includes                                                             */
     41 /*****************************************************************************/
     42 /* System include files */
     43 #include <stdio.h>
     44 #include <string.h>
     45 #include <assert.h>
     46 #include <arm_neon.h>
     47 
     48 /* User include files */
     49 #include "ihevc_typedefs.h"
     50 #include "itt_video_api.h"
     51 #include "ihevc_cmn_utils_neon.h"
     52 #include "ihevc_chroma_itrans_recon.h"
     53 #include "ihevc_chroma_intra_pred.h"
     54 #include "ihevc_debug.h"
     55 #include "ihevc_deblk.h"
     56 #include "ihevc_defs.h"
     57 #include "ihevc_itrans_recon.h"
     58 #include "ihevc_intra_pred.h"
     59 #include "ihevc_inter_pred.h"
     60 #include "ihevc_macros.h"
     61 #include "ihevc_mem_fns.h"
     62 #include "ihevc_padding.h"
     63 #include "ihevc_quant_iquant_ssd.h"
     64 #include "ihevc_resi_trans.h"
     65 #include "ihevc_sao.h"
     66 #include "ihevc_structs.h"
     67 #include "ihevc_weighted_pred.h"
     68 
     69 #include "rc_cntrl_param.h"
     70 #include "rc_frame_info_collector.h"
     71 #include "rc_look_ahead_params.h"
     72 
     73 #include "ihevce_api.h"
     74 #include "ihevce_defs.h"
     75 #include "ihevce_lap_enc_structs.h"
     76 #include "ihevce_multi_thrd_structs.h"
     77 #include "ihevce_function_selector.h"
     78 #include "ihevce_me_common_defs.h"
     79 #include "ihevce_enc_structs.h"
     80 #include "ihevce_had_satd.h"
     81 #include "ihevce_ipe_instr_set_router.h"
     82 #include "ihevce_global_tables.h"
     83 
     84 #include "hme_datatype.h"
     85 #include "hme_common_defs.h"
     86 #include "hme_interface.h"
     87 #include "hme_defs.h"
     88 
     89 #include "ihevce_me_instr_set_router.h"
     90 
     91 /*****************************************************************************/
     92 /* Function Declarations                                                     */
     93 /*****************************************************************************/
     94 FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;
     95 
     96 WORD32 ihevce_had4_4x4_neon(
     97     UWORD8 *pu1_src,
     98     WORD32 src_strd,
     99     UWORD8 *pu1_pred,
    100     WORD32 pred_strd,
    101     WORD16 *pi2_dst4x4,
    102     WORD32 dst_strd,
    103     WORD32 *pi4_hsad,
    104     WORD32 hsad_stride,
    105     WORD32 i4_frm_qstep);
    106 
    107 /*****************************************************************************/
    108 /* Function Definitions                                                      */
    109 /*****************************************************************************/
    110 
    111 static void hme_4x4_qpel_interp_avg_neon(
    112     UWORD8 *pu1_src_a,
    113     UWORD8 *pu1_src_b,
    114     WORD32 src_a_strd,
    115     WORD32 src_b_strd,
    116     UWORD8 *pu1_dst,
    117     WORD32 dst_strd)
    118 {
    119     uint8x16_t src_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
    120     uint8x16_t src_b = load_unaligned_u8q(pu1_src_b, src_b_strd);
    121     uint8x16_t dst = vrhaddq_u8(src_a, src_b);
    122 
    123     store_unaligned_u8q(pu1_dst, dst_strd, dst);
    124 }
    125 
    126 static void hme_8xn_qpel_interp_avg_neon(
    127     UWORD8 *pu1_src_a,
    128     UWORD8 *pu1_src_b,
    129     WORD32 src_a_strd,
    130     WORD32 src_b_strd,
    131     UWORD8 *pu1_dst,
    132     WORD32 dst_strd,
    133     WORD32 ht)
    134 {
    135     WORD32 i;
    136 
    137     for(i = 0; i < ht; i++)
    138     {
    139         uint8x8_t src_a = vld1_u8(pu1_src_a);
    140         uint8x8_t src_b = vld1_u8(pu1_src_b);
    141         uint8x8_t dst = vrhadd_u8(src_a, src_b);
    142 
    143         vst1_u8(pu1_dst, dst);
    144         pu1_src_a += src_a_strd;
    145         pu1_src_b += src_b_strd;
    146         pu1_dst += dst_strd;
    147     }
    148 }
    149 
    150 static void hme_16xn_qpel_interp_avg_neon(
    151     UWORD8 *pu1_src_a,
    152     UWORD8 *pu1_src_b,
    153     WORD32 src_a_strd,
    154     WORD32 src_b_strd,
    155     UWORD8 *pu1_dst,
    156     WORD32 dst_strd,
    157     WORD32 ht)
    158 {
    159     WORD32 i;
    160 
    161     for(i = 0; i < ht; i++)
    162     {
    163         uint8x16_t src_a = vld1q_u8(pu1_src_a);
    164         uint8x16_t src_b = vld1q_u8(pu1_src_b);
    165         uint8x16_t dst = vrhaddq_u8(src_a, src_b);
    166 
    167         vst1q_u8(pu1_dst, dst);
    168         pu1_src_a += src_a_strd;
    169         pu1_src_b += src_b_strd;
    170         pu1_dst += dst_strd;
    171     }
    172 }
    173 
    174 static void hme_32xn_qpel_interp_avg_neon(
    175     UWORD8 *pu1_src_a,
    176     UWORD8 *pu1_src_b,
    177     WORD32 src_a_strd,
    178     WORD32 src_b_strd,
    179     UWORD8 *pu1_dst,
    180     WORD32 dst_strd,
    181     WORD32 ht)
    182 {
    183     WORD32 i;
    184 
    185     for(i = 0; i < ht; i++)
    186     {
    187         uint8x16_t src_a_0 = vld1q_u8(pu1_src_a);
    188         uint8x16_t src_b_0 = vld1q_u8(pu1_src_b);
    189         uint8x16_t dst_0 = vrhaddq_u8(src_a_0, src_b_0);
    190 
    191         uint8x16_t src_a_1 = vld1q_u8(pu1_src_a + 16);
    192         uint8x16_t src_b_1 = vld1q_u8(pu1_src_b + 16);
    193         uint8x16_t dst_1 = vrhaddq_u8(src_a_1, src_b_1);
    194 
    195         vst1q_u8(pu1_dst, dst_0);
    196         vst1q_u8(pu1_dst + 16, dst_1);
    197         pu1_src_a += src_a_strd;
    198         pu1_src_b += src_b_strd;
    199         pu1_dst += dst_strd;
    200     }
    201 }
    202 
    203 static void hme_4mx4n_qpel_interp_avg_neon(
    204     UWORD8 *pu1_src_a,
    205     UWORD8 *pu1_src_b,
    206     WORD32 src_a_strd,
    207     WORD32 src_b_strd,
    208     UWORD8 *pu1_dst,
    209     WORD32 dst_strd,
    210     WORD32 blk_wd,
    211     WORD32 blk_ht)
    212 {
    213     WORD32 i, j;
    214 
    215     assert(blk_wd % 4 == 0);
    216     assert(blk_ht % 4 == 0);
    217 
    218     for(i = 0; i < blk_ht; i += 4)
    219     {
    220         for(j = 0; j < blk_wd;)
    221         {
    222             WORD32 wd = blk_wd - j;
    223 
    224             if(wd >= 32)
    225             {
    226                 hme_32xn_qpel_interp_avg_neon(
    227                     pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
    228                 j += 32;
    229             }
    230             else if(wd >= 16)
    231             {
    232                 hme_16xn_qpel_interp_avg_neon(
    233                     pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
    234                 j += 16;
    235             }
    236             else if(wd >= 8)
    237             {
    238                 hme_8xn_qpel_interp_avg_neon(
    239                     pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
    240                 j += 8;
    241             }
    242             else
    243             {
    244                 hme_4x4_qpel_interp_avg_neon(
    245                     pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd);
    246                 j += 4;
    247             }
    248         }
    249         pu1_src_a += (4 * src_a_strd);
    250         pu1_src_b += (4 * src_b_strd);
    251         pu1_dst += (4 * dst_strd);
    252     }
    253 }
    254 
    255 void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
    256 {
    257     U08 *pu1_src1, *pu1_src2, *pu1_dst;
    258     qpel_input_buf_cfg_t *ps_inp_cfg;
    259     S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    260     S32 i4_ref_stride = ps_prms->i4_ref_stride;
    261 
    262     i4_mv_x_frac = i4_mv_x & 3;
    263     i4_mv_y_frac = i4_mv_y & 3;
    264 
    265     i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    266 
    267     /* Derive the descriptor that has all offset and size info */
    268     ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    269 
    270     if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
    271     {
    272         /* This is case for fxfy/hxfy/fxhy/hxhy */
    273         ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    274         ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    275         ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
    276         ps_prms->i4_final_out_stride = i4_ref_stride;
    277 
    278         return;
    279     }
    280 
    281     pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    282     pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    283     pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    284 
    285     pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    286     pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    287     pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);
    288 
    289     pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
    290 
    291     hme_4mx4n_qpel_interp_avg_neon(
    292         pu1_src1,
    293         pu1_src2,
    294         ps_prms->i4_ref_stride,
    295         ps_prms->i4_ref_stride,
    296         pu1_dst,
    297         ps_prms->i4_out_stride,
    298         ps_prms->i4_blk_wd,
    299         ps_prms->i4_blk_ht);
    300     ps_prms->pu1_final_out = pu1_dst;
    301     ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    302 }
    303 
    304 // TODO: Can this function and above function be unified
    305 void hme_qpel_interp_avg_1pt_neon(
    306     interp_prms_t *ps_prms,
    307     S32 i4_mv_x,
    308     S32 i4_mv_y,
    309     S32 i4_buf_id,
    310     U08 **ppu1_final,
    311     S32 *pi4_final_stride)
    312 {
    313     U08 *pu1_src1, *pu1_src2, *pu1_dst;
    314     qpel_input_buf_cfg_t *ps_inp_cfg;
    315     S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    316     S32 i4_ref_stride = ps_prms->i4_ref_stride;
    317 
    318     i4_mv_x_frac = i4_mv_x & 3;
    319     i4_mv_y_frac = i4_mv_y & 3;
    320 
    321     i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    322 
    323     /* Derive the descriptor that has all offset and size info */
    324     ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    325 
    326     pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    327     pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    328     pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    329 
    330     pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    331     pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    332     pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);
    333 
    334     pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
    335 
    336     hme_4mx4n_qpel_interp_avg_neon(
    337         pu1_src1,
    338         pu1_src2,
    339         ps_prms->i4_ref_stride,
    340         ps_prms->i4_ref_stride,
    341         pu1_dst,
    342         ps_prms->i4_out_stride,
    343         ps_prms->i4_blk_wd,
    344         ps_prms->i4_blk_ht);
    345     ppu1_final[i4_buf_id] = pu1_dst;
    346     pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
    347 }
    348 
    349 void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
    350     interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
    351 {
    352     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
    353 
    354     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
    355 }
    356 
    357 void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
    358     interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
    359 {
    360     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
    361 
    362     hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
    363 }
    364 
    365 void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
    366     err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
    367 {
    368     mv_refine_ctxt_t *refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    369     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    370     S32 *pi4_valid_part_ids = &refine_ctxt->ai4_part_id[0];
    371 
    372     S32 ai4_satd_4x4[16];
    373     S32 ai4_satd_8x8[4];
    374 
    375     U08 *pu1_inp = ps_prms->pu1_inp;
    376     U08 *pu1_ref = ps_prms->pu1_ref;
    377 
    378     S32 inp_stride = ps_prms->i4_inp_stride;
    379     S32 ref_stride = ps_prms->i4_ref_stride;
    380 
    381     S32 i;
    382 
    383     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
    384     for(i = 0; i < 4; i++)
    385     {
    386         U08 *pu1_src = pu1_inp + (i & 0x1) * 8 + (i >> 1) * inp_stride * 8;
    387         U08 *pu1_pred = pu1_ref + (i & 0x1) * 8 + (i >> 1) * ref_stride * 8;
    388         S16 idx = (i & 0x1) * 2 + (i >> 1) * 8;
    389 
    390         ai4_satd_8x8[i] = ihevce_had4_4x4_neon(
    391             pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 0, &ai4_satd_4x4[idx], 4, 0);
    392     }
    393 
    394     /* Update 16x16 SATDs */
    395     pi4_sad_grid[PART_ID_2Nx2N] =
    396         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
    397 
    398     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    399     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    400     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    401     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
    402 
    403     /* Update 8x16 / 16x8 SATDs */
    404     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    405     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    406     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    407     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
    408 
    409     /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
    410     pi4_sad_grid[PART_ID_nLx2N_L] =
    411         ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    412     pi4_sad_grid[PART_ID_nRx2N_R] =
    413         ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    414     pi4_sad_grid[PART_ID_2NxnU_T] =
    415         ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    416     pi4_sad_grid[PART_ID_2NxnD_B] =
    417         ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
    418 
    419     pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
    420     pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
    421     pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
    422     pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
    423 
    424     /* For each valid partition, update the refine_prm structure to
    425      * reflect the best and second best candidates for that partition */
    426     for(i = 0; i < refine_ctxt->i4_num_valid_parts; i++)
    427     {
    428         S32 part_id = pi4_valid_part_ids[i];
    429         S32 id = (refine_ctxt->i4_num_valid_parts > 8) ? part_id : i;
    430         S32 i4_mv_cost = refine_ctxt->i2_mv_cost[0][id];
    431         S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
    432         S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
    433         S32 best_node_cost = CLIP_S16(refine_ctxt->i2_tot_cost[0][id]);
    434 
    435         if(i4_tot_cost < best_node_cost)
    436         {
    437             refine_ctxt->i2_tot_cost[0][id] = i4_tot_cost;
    438             refine_ctxt->i2_mv_cost[0][id] = i4_mv_cost;
    439             refine_ctxt->i2_mv_x[0][id] = ps_result_prms->i2_mv_x;
    440             refine_ctxt->i2_mv_y[0][id] = ps_result_prms->i2_mv_y;
    441             refine_ctxt->i2_ref_idx[0][id] = ps_result_prms->i1_ref_idx;
    442         }
    443     }
    444 }
    445