1 /****************************************************************************** 2 * 3 * Copyright (C) 2018 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /** 21 ****************************************************************************** 22 * @file 23 * ihevce_subpel_neon.c 24 * 25 * @brief 26 * Subpel refinement modules for ME algo 27 * 28 * @author 29 * Ittiam 30 * 31 * @par List of Functions: 32 * 33 * @remarks 34 * None 35 * 36 ******************************************************************************** 37 */ 38 39 /*****************************************************************************/ 40 /* File Includes */ 41 /*****************************************************************************/ 42 /* System include files */ 43 #include <stdio.h> 44 #include <string.h> 45 #include <assert.h> 46 #include <arm_neon.h> 47 48 /* User include files */ 49 #include "ihevc_typedefs.h" 50 #include "itt_video_api.h" 51 #include "ihevc_cmn_utils_neon.h" 52 #include "ihevc_chroma_itrans_recon.h" 53 #include "ihevc_chroma_intra_pred.h" 54 #include "ihevc_debug.h" 55 #include "ihevc_deblk.h" 56 #include "ihevc_defs.h" 57 #include "ihevc_itrans_recon.h" 58 #include "ihevc_intra_pred.h" 59 #include "ihevc_inter_pred.h" 60 #include "ihevc_macros.h" 
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_common_defs.h"
#include "hme_interface.h"
#include "hme_defs.h"

#include "ihevce_me_instr_set_router.h"

/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/
FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;

/* Recursive 4x(4x4) Hadamard-SATD helper (defined elsewhere).
 * Returns the 8x8 SATD and writes the four constituent 4x4 SATDs through
 * pi4_hsad (placement relative to pi4_hsad is governed by hsad_stride;
 * exact store pattern is defined by the implementation, not visible here). */
WORD32 ihevce_had4_4x4_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst4x4,
    WORD32 dst_strd,
    WORD32 *pi4_hsad,
    WORD32 hsad_stride,
    WORD32 i4_frm_qstep);

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/* Rounding average of two 4x4 pixel blocks: dst = (a + b + 1) >> 1.
 * The four 4-byte rows of each source are gathered (strided, possibly
 * unaligned) into one 128-bit register so a single vrhaddq_u8 covers the
 * whole block. */
static void hme_4x4_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd)
{
    uint8x16_t src_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
    uint8x16_t src_b = load_unaligned_u8q(pu1_src_b, src_b_strd);
    /* vrhaddq_u8 = unsigned rounding halving add */
    uint8x16_t dst = vrhaddq_u8(src_a, src_b);

    store_unaligned_u8q(pu1_dst, dst_strd, dst);
}

/* Rounding average of two 8-wide, ht-tall blocks, one 8-byte row per
 * iteration. */
static void hme_8xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 i;

    for(i = 0; i < ht; i++)
    {
        uint8x8_t src_a = vld1_u8(pu1_src_a);
        uint8x8_t src_b = vld1_u8(pu1_src_b);
        uint8x8_t dst = vrhadd_u8(src_a, src_b);

        vst1_u8(pu1_dst, dst);
        pu1_src_a += src_a_strd;
        pu1_src_b += src_b_strd;
        pu1_dst += dst_strd;
    }
}

/* Rounding average of two 16-wide, ht-tall blocks, one 16-byte row per
 * iteration. */
static void hme_16xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 i;

    for(i = 0; i < ht; i++)
    {
        uint8x16_t src_a = vld1q_u8(pu1_src_a);
        uint8x16_t src_b = vld1q_u8(pu1_src_b);
        uint8x16_t dst = vrhaddq_u8(src_a, src_b);

        vst1q_u8(pu1_dst, dst);
        pu1_src_a += src_a_strd;
        pu1_src_b += src_b_strd;
        pu1_dst += dst_strd;
    }
}

/* Rounding average of two 32-wide, ht-tall blocks; each row is processed as
 * two independent 16-byte halves. */
static void hme_32xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 i;

    for(i = 0; i < ht; i++)
    {
        uint8x16_t src_a_0 = vld1q_u8(pu1_src_a);
        uint8x16_t src_b_0 = vld1q_u8(pu1_src_b);
        uint8x16_t dst_0 = vrhaddq_u8(src_a_0, src_b_0);

        uint8x16_t src_a_1 = vld1q_u8(pu1_src_a + 16);
        uint8x16_t src_b_1 = vld1q_u8(pu1_src_b + 16);
        uint8x16_t dst_1 = vrhaddq_u8(src_a_1, src_b_1);

        vst1q_u8(pu1_dst, dst_0);
        vst1q_u8(pu1_dst + 16, dst_1);
        pu1_src_a += src_a_strd;
        pu1_src_b += src_b_strd;
        pu1_dst += dst_strd;
    }
}

/* Rounding average of two blocks of arbitrary multiple-of-4 dimensions.
 * The block is processed as horizontal bands 4 rows tall; within each band
 * the width is greedily tiled with the widest available kernel
 * (32 -> 16 -> 8 -> 4 columns), so any 4m x 4n geometry is covered. */
static void hme_4mx4n_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 blk_wd,
    WORD32 blk_ht)
{
    WORD32 i, j;

    assert(blk_wd % 4 == 0);
    assert(blk_ht % 4 == 0);

    for(i = 0; i < blk_ht; i += 4)
    {
        for(j = 0; j < blk_wd;)
        {
            WORD32 wd = blk_wd - j;

            if(wd >= 32)
            {
                hme_32xn_qpel_interp_avg_neon(
                    pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
                j += 32;
            }
            else if(wd >= 16)
            {
                hme_16xn_qpel_interp_avg_neon(
                    pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
                j += 16;
            }
            else if(wd >= 8)
            {
                hme_8xn_qpel_interp_avg_neon(
                    pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd, 4);
                j += 8;
            }
            else
            {
                hme_4x4_qpel_interp_avg_neon(
                    pu1_src_a + j, pu1_src_b + j, src_a_strd, src_b_strd, pu1_dst + j, dst_strd);
                j += 4;
            }
        }
        pu1_src_a += (4 * src_a_strd);
        pu1_src_b += (4 * src_b_strd);
        pu1_dst += (4 * dst_strd);
    }
}

/* Produce the quarter-pel interpolated block for motion vector
 * (i4_mv_x, i4_mv_y) in quarter-pel units.
 *
 * The fractional part of the MV indexes the global descriptor table
 * gas_qpel_inp_buf_cfg, which names one or two pre-interpolated reference
 * planes (ppu1_ref[]) plus per-plane x/y offsets.  If both plane ids are
 * equal the requested point already exists in a plane (full/half-pel
 * positions): the function just publishes a pointer into that plane.
 * Otherwise the qpel sample is the rounding average of the two planes,
 * written into the scratch buffer selected by i4_buf_id.
 *
 * Outputs: ps_prms->pu1_final_out / i4_final_out_stride. */
void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    U08 *pu1_src1, *pu1_src2, *pu1_dst;
    qpel_input_buf_cfg_t *ps_inp_cfg;
    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    S32 i4_ref_stride = ps_prms->i4_ref_stride;

    /* Split the MV into integer and quarter-pel fractional parts */
    i4_mv_x_frac = i4_mv_x & 3;
    i4_mv_y_frac = i4_mv_y & 3;

    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

    /* Derive the descriptor that has all offset and size info */
    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

    if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
    {
        /* This is case for fxfy/hxfy/fxhy/hxhy: no averaging needed,
         * point directly into the existing plane */
        ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
        ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
        ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
        ps_prms->i4_final_out_stride = i4_ref_stride;

        return;
    }

    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);

    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);

    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];

    hme_4mx4n_qpel_interp_avg_neon(
        pu1_src1,
        pu1_src2,
        ps_prms->i4_ref_stride,
        ps_prms->i4_ref_stride,
        pu1_dst,
        ps_prms->i4_out_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);
    ps_prms->pu1_final_out = pu1_dst;
    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
}

// TODO: Can this function and above function be unified
/* Variant of hme_qpel_interp_avg_neon that records the result pointer and
 * stride in caller-supplied arrays (indexed by i4_buf_id) instead of in
 * ps_prms.
 *
 * NOTE(review): unlike hme_qpel_interp_avg_neon, there is no
 * i1_buf_id1 == i1_buf_id2 shortcut here — presumably callers only request
 * positions that genuinely need two-plane averaging; confirm against the
 * call sites before unifying the two functions per the TODO above. */
void hme_qpel_interp_avg_1pt_neon(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    S32 i4_buf_id,
    U08 **ppu1_final,
    S32 *pi4_final_stride)
{
    U08 *pu1_src1, *pu1_src2, *pu1_dst;
    qpel_input_buf_cfg_t *ps_inp_cfg;
    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    S32 i4_ref_stride = ps_prms->i4_ref_stride;

    /* Split the MV into integer and quarter-pel fractional parts */
    i4_mv_x_frac = i4_mv_x & 3;
    i4_mv_y_frac = i4_mv_y & 3;

    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

    /* Derive the descriptor that has all offset and size info */
    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);

    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * i4_ref_stride);

    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];

    hme_4mx4n_qpel_interp_avg_neon(
        pu1_src1,
        pu1_src2,
        ps_prms->i4_ref_stride,
        ps_prms->i4_ref_stride,
        pu1_dst,
        ps_prms->i4_out_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);
    ppu1_final[i4_buf_id] = pu1_dst;
    pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
}

/* Interpolate the two vertical qpel neighbours (y+1 into buffer 3,
 * y-1 into buffer 1) of the given MV.  Buffer-id assignments are a fixed
 * convention shared with the consumers of ppu1_final/pi4_final_stride. */
void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
}

/* Interpolate the two horizontal qpel neighbours (x+1 into buffer 2,
 * x-1 into buffer 0) of the given MV. */
void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
}

/* Compute Hadamard SATD for a 16x16 PU at all partition granularities
 * (2Nx2N, NxN, Nx2N/2NxN, and the four AMP splits), then update the best
 * candidate per valid partition in the subpel refinement context.
 *
 * The 16x16 SATD is built bottom-up: ihevce_had4_4x4_neon is invoked once
 * per 8x8 quadrant, returning that quadrant's 8x8 SATD and filling the
 * quadrant's four 4x4 SATDs into ai4_satd_4x4 at base idx with stride 4.
 * Larger partitions are then sums/differences of these values. */
void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_valid_part_ids = &refine_ctxt->ai4_part_id[0];

    S32 ai4_satd_4x4[16]; /* 16 4x4 SATDs, filled by ihevce_had4_4x4_neon */
    S32 ai4_satd_8x8[4]; /* 8x8 SATD per quadrant: TL, TR, BL, BR */

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    S32 i;

    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
    for(i = 0; i < 4; i++)
    {
        /* (i & 1, i >> 1) walks the 8x8 quadrants: TL, TR, BL, BR */
        U08 *pu1_src = pu1_inp + (i & 0x1) * 8 + (i >> 1) * inp_stride * 8;
        U08 *pu1_pred = pu1_ref + (i & 0x1) * 8 + (i >> 1) * ref_stride * 8;
        /* base index of this quadrant's 4x4 SATDs within ai4_satd_4x4 */
        S16 idx = (i & 0x1) * 2 + (i >> 1) * 8;

        ai4_satd_8x8[i] = ihevce_had4_4x4_neon(
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 0, &ai4_satd_4x4[idx], 4, 0);
    }

    /* Update 16x16 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* NxN = the four 8x8 quadrants */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

    /* Update 8x16 / 16x8 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */
    /* NOTE(review): the index sets below encode the memory layout that
     * ihevce_had4_4x4_neon uses when scattering 4x4 SATDs (base idx,
     * stride 4).  That layout is not visible in this file — verify these
     * sets against ihevce_had4_4x4_neon's store pattern and the scalar
     * reference implementation of this function. */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    /* The large half of each AMP split is the 16x16 total minus the small
     * half computed above */
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];

    /* For each valid partition, update the refine_prm structure to
     * reflect the best and second best candidates for that partition */
    for(i = 0; i < refine_ctxt->i4_num_valid_parts; i++)
    {
        S32 part_id = pi4_valid_part_ids[i];
        /* When more than 8 partitions are valid the result arrays are
         * indexed by partition id directly; otherwise they are packed by
         * position in the valid-part list */
        S32 id = (refine_ctxt->i4_num_valid_parts > 8) ? part_id : i;
        S32 i4_mv_cost = refine_ctxt->i2_mv_cost[0][id];
        /* Clamp SATD and total cost into S16 range before comparing with
         * the 16-bit fields stored in refine_ctxt */
        S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
        S32 best_node_cost = CLIP_S16(refine_ctxt->i2_tot_cost[0][id]);

        if(i4_tot_cost < best_node_cost)
        {
            refine_ctxt->i2_tot_cost[0][id] = i4_tot_cost;
            refine_ctxt->i2_mv_cost[0][id] = i4_mv_cost;
            refine_ctxt->i2_mv_x[0][id] = ps_result_prms->i2_mv_x;
            refine_ctxt->i2_mv_y[0][id] = ps_result_prms->i2_mv_y;
            refine_ctxt->i2_ref_idx[0][id] = ps_result_prms->i1_ref_idx;
        }
    }
}