1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_intra_pred_filters_neon_intr.c 22 * 23 * @brief 24 * Contains function Definition for intra prediction interpolation filters 25 * 26 * 27 * @author 28 * Yogeswaran RS 29 * 30 * @par List of Functions: 31 * - ihevc_intra_pred_luma_planar() 32 * - ihevc_intra_pred_luma_dc() 33 * - ihevc_intra_pred_luma_horz() 34 * - ihevc_intra_pred_luma_ver() 35 * - ihevc_intra_pred_luma_mode2() 36 * - ihevc_intra_pred_luma_mode_18_34() 37 * 38 * @remarks 39 * None 40 * 41 ******************************************************************************* 42 */ 43 /*****************************************************************************/ 44 /* File Includes */ 45 /*****************************************************************************/ 46 #include <stdio.h> 47 48 #include "ihevc_typedefs.h" 49 #include "ihevc_intra_pred.h" 50 #include "ihevc_macros.h" 51 #include "ihevc_func_selector.h" 52 #include "arm_neon.h" 53 #include "ihevc_platform_macros.h" 54 #include "ihevc_common_tables.h" 55 56 /****************************************************************************/ 57 /* 
Constant Macros */ 58 /****************************************************************************/ 59 #define MAX_CU_SIZE 64 60 #define BIT_DEPTH 8 61 #define T32_4NT 128 62 #define T16_4NT 64 63 64 65 66 /*****************************************************************************/ 67 /* Table Look-up */ 68 /*****************************************************************************/ 69 70 #define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x) 71 72 /*****************************************************************************/ 73 /* Function Definition */ 74 /*****************************************************************************/ 75 76 /** 77 ******************************************************************************* 78 * 79 * @brief 80 * Intra prediction interpolation filter for pu1_ref substitution 81 * 82 * 83 * @par Description: 84 * Reference substitution process for samples unavailable for prediction 85 * Refer to section 8.4.4.2.2 86 * 87 * @param[in] pu1_top_left 88 * UWORD8 pointer to the top-left 89 * 90 * @param[in] pu1_top 91 * UWORD8 pointer to the top 92 * 93 * @param[in] pu1_left 94 * UWORD8 pointer to the left 95 * 96 * @param[in] src_strd 97 * WORD32 Source stride 98 * 99 * @param[in] nbr_flags 100 * WORD32 neighbor availability flags 101 * 102 * @param[in] nt 103 * WORD32 transform Block size 104 * 105 * @param[in] dst_strd 106 * WORD32 Destination stride 107 * 108 * @returns 109 * 110 * @remarks 111 * None 112 * 113 ******************************************************************************* 114 */ 115 116 117 void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left, 118 UWORD8 *pu1_top, 119 UWORD8 *pu1_left, 120 WORD32 src_strd, 121 WORD32 nt, 122 WORD32 nbr_flags, 123 UWORD8 *pu1_dst, 124 WORD32 dst_strd) 125 { 126 UWORD8 pu1_ref; 127 WORD32 dc_val, i; 128 WORD32 total_samples = (4 * nt) + 1; 129 WORD32 two_nt = 2 * nt; 130 WORD32 three_nt = 3 * nt; 131 WORD32 get_bits; 132 WORD32 next; 133 WORD32 bot_left, 
left, top, tp_right, tp_left; 134 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag; 135 UNUSED(dst_strd); 136 dc_val = 1 << (BIT_DEPTH - 1); 137 138 /* Neighbor Flag Structure*/ 139 /* Top-Left | Top-Right | Top | Left | Bottom-Left 140 1 4 4 4 4 141 */ 142 143 /* If no neighbor flags are present, fill the neighbor samples with DC value */ 144 if(nbr_flags == 0) 145 { 146 for(i = 0; i < total_samples; i++) 147 { 148 pu1_dst[i] = dc_val; 149 } 150 } 151 else 152 { 153 /* Else fill the corresponding samples */ 154 pu1_dst[two_nt] = *pu1_top_left; 155 UWORD8 *pu1_dst_tmp2 = pu1_dst; 156 UWORD8 *pu1_top_tmp = pu1_top; 157 pu1_dst_tmp2 += two_nt + 1; 158 159 for(i = 0; i < two_nt; i++) 160 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd]; 161 162 uint8x8_t src; 163 for(i = two_nt; i > 0; i -= 8) 164 { 165 src = vld1_u8(pu1_top_tmp); 166 pu1_top_tmp += 8; 167 vst1_u8(pu1_dst_tmp2, src); 168 pu1_dst_tmp2 += 8; 169 } 170 171 if(nt <= 8) 172 { 173 /* 1 bit extraction for all the neighboring blocks */ 174 tp_left = (nbr_flags & 0x10000) >> 16; 175 bot_left = nbr_flags & 0x1; 176 left = (nbr_flags & 0x10) >> 4; 177 top = (nbr_flags & 0x100) >> 8; 178 tp_right = (nbr_flags & 0x1000) >> 12; 179 180 next = 1; 181 182 /* If bottom -left is not available, reverse substitution process*/ 183 if(bot_left == 0) 184 { 185 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right }; 186 187 /* Check for the 1st available sample from bottom-left*/ 188 while(!a_nbr_flag[next]) 189 next++; 190 191 /* If Left, top-left are available*/ 192 if(next <= 2) 193 { 194 idx = nt * next; 195 pu1_ref = pu1_dst[idx]; 196 for(i = 0; i < idx; i++) 197 pu1_dst[i] = pu1_ref; 198 } 199 else /* If top, top-right are available */ 200 { 201 /* Idx is changed to copy 1 pixel value for top-left ,if top-left is not available*/ 202 idx = (nt * (next - 1)) + 1; 203 pu1_ref = pu1_dst[idx]; 204 for(i = 0; i < idx; i++) 205 pu1_dst[i] = pu1_ref; 206 } 207 } 208 209 /* Forward Substitution Process */ 210 /* If left is 
Unavailable, copy the last bottom-left value */ 211 212 if(left == 0) 213 { 214 uint8x8_t dup_pu1_dst1; 215 UWORD8 *pu1_dst_const_nt = pu1_dst; 216 pu1_dst_const_nt += nt; 217 218 if(0 == (nt & 7)) 219 { 220 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); 221 for(i = nt; i > 0; i -= 8) 222 { 223 vst1_u8(pu1_dst_const_nt, dup_pu1_dst1); 224 pu1_dst_const_nt += 8; 225 226 } 227 } 228 else 229 { 230 //uint32x2_t dup_pu1_dst4; 231 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); 232 //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]); 233 for(i = nt; i > 0; i -= 4) 234 { 235 vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0); 236 pu1_dst_const_nt += 4; 237 238 } 239 240 } 241 242 } 243 if(tp_left == 0) 244 pu1_dst[two_nt] = pu1_dst[two_nt - 1]; 245 if(top == 0) 246 { 247 248 if(0 == (nt & 7)) 249 { 250 uint8x8_t dup_pu1_dst2; 251 UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst; 252 pu1_dst_const_two_nt_1 += (two_nt + 1); 253 dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]); 254 for(i = nt; i > 0; i -= 8) 255 { 256 vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2); 257 pu1_dst_const_two_nt_1 += 8; 258 259 } 260 } 261 else 262 { 263 for(i = 0; i < nt; i++) 264 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt]; 265 } 266 } 267 if(tp_right == 0) 268 { 269 uint8x8_t dup_pu1_dst3; 270 UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst; 271 pu1_dst_const_three_nt_1 += (three_nt + 1); 272 dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]); 273 if(0 == (nt & 7)) 274 { 275 for(i = nt; i > 0; i -= 8) 276 { 277 vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3); 278 pu1_dst_const_three_nt_1 += 8; 279 280 } 281 } 282 else 283 { 284 for(i = nt; i > 0; i -= 4) 285 { 286 vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0); 287 pu1_dst_const_three_nt_1 += 4; 288 } 289 290 } 291 292 } 293 } 294 if(nt == 16) 295 { 296 WORD32 nbr_flags_temp = 0; 297 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2) 298 + ((nbr_flags & 0x300) >> 4) 299 + ((nbr_flags & 0x3000) 
>> 6) 300 + ((nbr_flags & 0x10000) >> 8); 301 302 /* compute trailing zeors based on nbr_flag for substitution process of below left see section .*/ 303 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 304 { 305 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */ 306 307 if(nbr_id_from_bl == 64) 308 nbr_id_from_bl = 32; 309 310 if(nbr_id_from_bl == 32) 311 { 312 /* for top left : 1 pel per nbr bit */ 313 if(!((nbr_flags_temp >> 8) & 0x1)) 314 { 315 nbr_id_from_bl++; 316 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */ 317 } 318 } 319 /* Reverse Substitution Process*/ 320 if(nbr_id_from_bl) 321 { 322 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 323 pu1_ref = pu1_dst[nbr_id_from_bl]; 324 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 325 { 326 pu1_dst[i] = pu1_ref; 327 } 328 } 329 } 330 331 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 332 while(nbr_id_from_bl < ((T16_4NT) + 1)) 333 { 334 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 335 /* Devide by 8 to obtain the original index */ 336 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 337 338 /* The Top-left flag is at the last bit location of nbr_flags*/ 339 if(nbr_id_from_bl == (T16_4NT / 2)) 340 { 341 get_bits = GET_BITS(nbr_flags_temp, 8); 342 343 /* only pel substitution for TL */ 344 if(!get_bits) 345 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 346 } 347 else 348 { 349 get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag); 350 if(!get_bits) 351 { 352 /* 8 pel substitution (other than TL) */ 353 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 354 for(i = 0; i < 8; i++) 355 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 356 } 357 358 } 359 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 
1 : 8; 360 } 361 } 362 363 if(nt == 32) 364 { 365 /* compute trailing ones based on mbr_flag for substitution process of below left see section .*/ 366 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 367 { 368 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */ 369 370 if(nbr_id_from_bl == 64) 371 { 372 /* for top left : 1 pel per nbr bit */ 373 if(!((nbr_flags >> 16) & 0x1)) 374 { 375 /* top left not available */ 376 nbr_id_from_bl++; 377 /* top and top right; 8 pels per nbr bit */ 378 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8; 379 } 380 } 381 /* Reverse Substitution Process*/ 382 if(nbr_id_from_bl) 383 { 384 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 385 pu1_ref = pu1_dst[nbr_id_from_bl]; 386 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 387 pu1_dst[i] = pu1_ref; 388 } 389 } 390 391 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 392 while(nbr_id_from_bl < ((T32_4NT)+1)) 393 { 394 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 395 /* Devide by 8 to obtain the original index */ 396 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 397 398 /* The Top-left flag is at the last bit location of nbr_flags*/ 399 if(nbr_id_from_bl == (T32_4NT / 2)) 400 { 401 get_bits = GET_BITS(nbr_flags, 16); 402 /* only pel substitution for TL */ 403 if(!get_bits) 404 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 405 } 406 else 407 { 408 get_bits = GET_BITS(nbr_flags, frwd_nbr_flag); 409 if(!get_bits) 410 { 411 /* 8 pel substitution (other than TL) */ 412 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 413 for(i = 0; i < 8; i++) 414 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 415 } 416 417 } 418 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 
1 : 8; 419 } 420 } 421 422 } 423 424 } 425 426 /** 427 ******************************************************************************* 428 * 429 * @brief 430 * Intra prediction interpolation filter for ref_filtering 431 * 432 * 433 * @par Description: 434 * Reference DC filtering for neighboring samples dependent on TU size and 435 * mode Refer to section 8.4.4.2.3 in the standard 436 * 437 * @param[in] pu1_src 438 * UWORD8 pointer to the source 439 * 440 * @param[out] pu1_dst 441 * UWORD8 pointer to the destination 442 * 443 * @param[in] nt 444 * integer Transform Block size 445 * 446 * @param[in] mode 447 * integer intraprediction mode 448 * 449 * @returns 450 * 451 * @remarks 452 * None 453 * 454 ******************************************************************************* 455 */ 456 457 458 void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src, 459 WORD32 nt, 460 UWORD8 *pu1_dst, 461 WORD32 mode, 462 WORD32 strong_intra_smoothing_enable_flag) 463 { 464 WORD32 filter_flag; 465 WORD32 i = 0; 466 WORD32 four_nt = 4 * nt; 467 468 WORD32 src_4nt; 469 470 /* Naming has been made as per the functionlity it has, For eg. 
pu1_src_tmp_1 is denoting pu1_src + 1 */
    /* src_val_1 to load value from pointer pu1_src_tmp_1, add_res has the result of adding 2 values */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

    /* Table lookup decides whether this (mode, nt) pair needs filtering */
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));

    if(0 == filter_flag)
    {
        /* No filtering: plain copy, skipped entirely for in-place operation */
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong intra filtering: both edges must be near-linear
             * (second difference below the DC threshold) */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                                - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                                 - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }

        /* Saved now because the filtering loops below may overwrite it when
         * pu1_src == pu1_dst; restored at the end */
        src_4nt = pu1_src[4 * nt];
        /* Strong filtering of reference samples: bi-linear interpolation
         * between the three corner samples pu1_src[0], [2*nt], [4*nt] */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

            /* Both halves (left edge and top edge) interpolated in the same
             * iteration; vmlsl/vmlal pairs compute
             * (two_nt - i)*corner0 + i*corner1, rounded-shifted by 6 */
            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid the issue when the dest and src has the same pointer
             * the load has been done outside and the store of iteration k is
             * issued before the loads of iteration k+1 would be clobbered:
             * each pass loads first, then stores the PREVIOUS pass's result */

            /* Perform bilinear [1 2 1]/4 filtering of reference samples */
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                /* First iteration has nothing pending to store */
                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);

            }
            /* Flush the last pending result */
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;

    }

}



/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma planar
*
* @par Description:
*  Planar intra prediction with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
*  Each output pixel is
*  ((nt-1-col)*left + (col+1)*top_right + (nt-1-row)*top + (row+1)*bot_left
*   + nt) >> (log2(nt)+1)
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source reference array
*
* @param[in] src_strd
*  WORD32 source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* named it in the way (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor) */
    /* load const_nt_1_col values into a d register */
    /* named it in the way pu1_ref[nt - 1] --> pu1_ref_nt_1 */
    /* the value of pu1_ref_nt_1 is duplicated to d register hence pu1_ref_nt_1_dup */
    /* log2nt + 1 is taken care while assigning the values itself */
    /* In width multiple of 4 case the row also has been unrolled by 2 and store has been taken care */

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    /* Recomputed from nt; the '6' initializer above is only a placeholder */
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* loops have been unrolled considering the fact width is multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        /* Negative shift count => vshlq_s16 performs a right shift */
        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            /* Left reference sample for this row (per-row constant) */
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                /* (nt-1-col) factors: load 8 and reverse to descend with col */
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                /* Arithmetic shift right by log2nt+1 via negated left shift */
                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* loops have been unrolled considering the fact width is multiple of 4 */
    /* If column is multiple of 4 then height should be multiple of 2 */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        /* Two rows per iteration: lanes 0-3 hold row, lanes 4-7 hold row+1,
         * packed with vext_u8 at offset 4 */
        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                /* Duplicate the same 4 column factors into both half-vectors
                 * so one MAC covers the pair of rows */
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                /* Lane 0 -> row 'row', lane 1 -> row 'row + 1' */
                /* NOTE(review): uint32_t lane stores assume pu1_dst rows are
                 * 4-byte aligned - confirm against caller */
                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }

}
/* INTRA_PRED_LUMA_PLANAR */

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma dc
*
* @par Description:
*  Intra prediction for DC mode with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
*  Includes the first-row/first-column boundary smoothing for nt < 32.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source reference array
*
* @param[in] src_strd
*  WORD32 source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 destination stride
*
* @param[in] nt
*  WORD32 transform block size
*
* @param[in] mode
*  WORD32 intra prediction mode (unused)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is taken care while assigning the values itself. 
*/
    log2nt_plus1 = 32 - CLZ(nt);

    /* loops have been unrolled considering the fact width is multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        /* col is 0 here, so this zero-initializes the accumulator */
        uint64x1_t acc_dc = vdup_n_u64(col);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        /* Sum the nt left samples (from pu1_ref + nt) and the nt top samples
         * (from pu1_ref + 2*nt + 1) via pairwise widening adds */
        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

        /* DC value = (sum + nt) >> (log2(nt) + 1) */
        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        /* +2 is the rounding term of the later (x + 3*dc + 2) >> 2 filter */
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
        pu1_dst_tmp = pu1_dst;


        if(nt == 32)
        {
            /* No boundary filtering for nt == 32: flat DC fill */
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else

        {
            /* First row: (top[col] + 3*dc + 2) >> 2 */
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* Except the first row the remaining rows are done here */
            /* Both column and row have been unrolled by 8 */
            /* Store has been taken care for the unrolling */
            /* Except the 1st column of the remaining rows (other than 1st
             * row), the values are constant (dc), hence lane 0 is the
             * filtered left sample and lanes 1..7 are dc */
            /* If the column is greater than 8, the remaining values are
             * constant, which is taken care of in the inner for loop */

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                /* 8 filtered left-column values, loaded bottom-up */
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                /* Shift one filtered value into lane 0 per output row and
                 * fill lanes 1..7 with dc via vext; stores lag one step so
                 * the shuffle for row k+1 overlaps the store of row k */
                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For last set of 8 rows only 7 rows need to be updated since
                 * first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                /* Columns beyond the first 8 are pure dc for these 8 rows */
                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For last set of 8 rows only 7 rows need to be updated
                     * since first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            /* Corner pixel: (left[0] + 2*dc + top[0] + 2) >> 2 */
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* loops have been unrolled considering the fact width is multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        /* Scalar accumulation of left (pu1_ref[nt..2nt-1]) and top
         * (pu1_ref[2nt+1..3nt]) samples */
        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        /* +2 is the rounding term of the (x + 3*dc + 2) >> 2 filter below */
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            /* No boundary filtering for nt == 32: flat DC fill, 4 at a time */
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else

        {
            /* First row (excluding corner): (top[col] + 3*dc + 2) >> 2 */
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd + 0;
            /* Since first row is already updated before, loop count is nt-1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            /* First column (excluding corner): (left[row] + 3*dc + 2) >> 2;
             * overwrites the dc values stored by the loop above */
            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            /* Corner pixel: (left[0] + 2*dc + top[0] + 2) >> 2 */
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for horizontal luma variable. 
/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for the luma horizontal mode.
 *
 * @par Description:
 *  Horizontal intra prediction: every destination row is filled with the
 *  corresponding left-neighbour reference sample pu1_ref[2*nt - 1 - row].
 *  For nt < 32 the first destination row is additionally gradient-filtered:
 *  pu1_ref[2*nt - 1] + ((pu1_ref[2*nt + 1 + col] - pu1_ref[2*nt]) >> 1),
 *  saturated to 8 bits (visible in the vsubl/vshr/vqadd/vqmovun chain below).
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference sample array (left/top neighbours)
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination block
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer transform block size; assumed to be 4, 8, 16 or 32 — the
 *  (nt & 7) path effectively handles nt == 4 (TODO confirm with callers)
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns none
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{

    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;


    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        /* nt == 32: no boundary filtering; each row is plain replication of */
        /* the left reference sample for that row.                           */
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else


    /* Row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables. */
    /* Variables are named after the operation (instruction) they perform             */
    /* (e.g. shift_val holds the shifted value, add_sat the saturated add result).    */
    /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8.   */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4.          */
    {
        if(0 != (nt & 7)) /* condition for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            /* dup_sub = top-left sample, dup_add = first left sample — the   */
            /* constants of the first-row gradient filter.                    */
            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    /* First row: ref[2nt-1] + ((ref[2nt+1+col] - ref[2nt]) >> 1), */
                    /* saturated and narrowed back to u8.                          */
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    /* Remaining rows replicate the left reference samples; lanes  */
                    /* 2, 1, 0 are duplicated in decreasing order because the left */
                    /* references are stored bottom-to-top — TODO confirm layout.  */
                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;


                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }

        }

        /* dup_1 .. dup_8 hold the per-row duplicated left reference samples.         */
        /* Variables are named after the operation (instruction) they perform.        */
        /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4/8.  */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8.      */

        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

            /* First destination row: gradient-filtered top references. */
            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;

                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }

            /* Remaining rows: replicate left references; loaded 8 at a time  */
            /* and reversed because they are stored bottom-to-top.            */
            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;

                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */

                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);

                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;

                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need updating, */
                    /* since the first row was already written above.        */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;

                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */
/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for the luma vertical mode.
 *
 * @par Description:
 *  Vertical intra prediction: every destination column is filled with the
 *  corresponding top reference sample pu1_ref[2*nt + 1 + col]. For nt < 32
 *  the first destination column is additionally gradient-filtered:
 *  pu1_ref[2*nt + 1] + ((pu1_ref[2*nt - 1 - row] - pu1_ref[2*nt]) >> 1),
 *  saturated to 8 bits (visible in the vsubl/vshr/vqadd/vqmovun chain).
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference sample array (left/top neighbours)
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination block
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer transform block size; assumed to be 4, 8, 16 or 32 — the
 *  (nt & 7) path effectively handles nt == 4 (TODO confirm with callers)
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns none
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
    uint8x8_t pu1_val_two_nt_1_col;
    if(nt == 32)
    {
        /* nt == 32: no boundary filtering; every row is a plain copy of the */
        /* top reference row.                                                */
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            for(col = nt; col > 0; col -= 8)
            {
                pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
                pu1_dst_tmp += 8;
            }
            pu1_ref_tmp_1 -= nt;
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    {
        /* Variables are named after the operation (instruction) they perform       */
        /* (e.g. shift_val holds the shifted value, add_sat the saturated add).     */
        /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4/8.*/
        /* Rows and columns are unrolled by 4 when the width is a multiple of 4.    */

        if(0 != (nt & 7))
        {
            /* cond_4 gates the two inner loops: on the first outer iteration only  */
            /* the filtered first-column loop runs, afterwards only the copy loop.  */
            WORD32 cond_4 = 0;
            UWORD8 *pu1_ref_val1 = pu1_ref;
            UWORD8 *pu1_ref_val2 = pu1_ref;
            UWORD8 *pu1_ref_val3 = pu1_ref;

            UWORD8 *pu1_dst_val1 = pu1_dst;
            UWORD8 *pu1_dst_val2 = pu1_dst;
            UWORD8 *pu1_dst_val3 = pu1_dst;

            uint8x8_t dup_2_sub, round_val, vext_val;
            uint16x8_t dup_2_add;
            uint32x2_t src_val1, src_val2, src_val3;
            uint16x8_t sub_val;
            int16x8_t shift_val1, add_sat;
            uint64x1_t shift_val2;

            src_val1 = vdup_n_u32(0);
            src_val2 = vdup_n_u32(0);
            src_val3 = vdup_n_u32(0);
            pu1_ref_val1 += (two_nt - nt);
            pu1_ref_val3 += (two_nt + 2);
            pu1_ref_val2 += (two_nt + 1);

            /* Gradient-filter constants: top-left sample and first top sample. */
            dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            /* Loops to store the first nt sets of values in the destination. */

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
                {
                    /* Unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here */
                    src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
                    shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
                    round_val = vqmovun_s16(add_sat);

                    /* Unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here.   */
                    /* vext splices the filtered first-column pixel in front of the top references. */
                    src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
                    vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    /* Each subsequent row shifts the filtered column up by one byte. */
                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);

                    vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
                    vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
                    pu1_dst_val1 += dst_strd;

                    pu1_ref_val1 -= 4;
                }

                /* Loop to store the next sets of four values in the destination   */
                /* (plain replication of the top reference row).                   */

                for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
                {
                    src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;

                    vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
                    pu1_dst_val2 += dst_strd;
                }
                pu1_ref_val2 += 4;
                pu1_dst_val3 += 4;
                pu1_dst_val2 = pu1_dst_val3;
                cond_4 = 1;
            }
        }

        /* Rows and columns are unrolled by 8 when the width is a multiple of 8. */
        else
        {
            /* cond plays the same gating role as cond_4 in the 4-wide path. */
            WORD32 cond = 0, col_1;
            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst;

            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;
            UWORD8 *pu1_ref_tmp_3 = pu1_ref;

            uint8x8_t pu1_src_tmp1;
            uint8x8_t pu1_src_tmp2;

            uint8x8_t dup_sub;
            uint16x8_t dup_add;
            int16x8_t subsh_val;
            int16x8_t addsat_val;
            uint16x8_t sub_val;
            uint8x8_t round_val;
            uint8x8_t vext_t;
            uint64x1_t shift_64;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);

            pu1_ref_tmp_1 += (two_nt);
            pu1_ref_tmp_1 -= 8;
            pu1_ref_tmp_2 += (two_nt + 2);
            pu1_ref_tmp_3 += (two_nt + 1);

            /* Loops to store the first nt sets of values in the destination. */

            for(row = nt; row > 0; row -= 8)
            {
                for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
                {
                    /* Filtered first column: (left_ref - top_left) >> 1 added to */
                    /* ref[two_nt + 1] with saturation.                           */
                    pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);

                    sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
                    subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
                    addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(addsat_val);

                    /* Unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here */

                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
                    vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    /* Subsequent rows shift the filtered column vector one byte   */
                    /* per row before splicing it in front of the top references.  */
                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);

                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
                    vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
                    vst1_u8(pu1_dst_tmp_1, vext_t);
                    pu1_dst_tmp_1 += dst_strd;

                    pu1_ref_tmp_1 -= 8;
                }

                /* Loop to store the next sets of eight values in the destination */
                /* (plain replication of the top reference row).                  */

                for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
                {
                    pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
                    pu1_dst_tmp_2 += dst_strd;
                }
                pu1_ref_tmp_3 += 8;
                pu1_dst_tmp_3 += 8;
                pu1_dst_tmp_2 = pu1_dst_tmp_3;
                cond = 1;
            }
        }
    }
}
/* INTRA_PRED_LUMA_VER */
/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for luma mode 2.
 *
 * @par Description:
 *  Intra prediction for mode 2 (south-west diagonal angle) with reference
 *  neighbouring samples pointed to by 'pu1_ref', written to the TU block
 *  pointed to by 'pu1_dst'. Left reference samples are read in reverse order
 *  and each successive destination row is the previous row advanced by one
 *  reference sample.
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference sample array
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination block
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer transform block size; assumed 4, 8, 16 or 32 — the (nt & 7)
 *  path effectively handles nt == 4 (TODO confirm with callers)
 *
 * @param[in] mode
 *  integer intra prediction mode (unused)
 *
 * @returns none
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    /* rev_res holds the byte-reversed result.                                   */
    /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4/8. */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4.     */

    if(0 != (nt & 7))
    {
        /* NOTE(review): this path reads from pu1_ref with no two_nt offset; for */
        /* nt == 4 the bottom-left references occupy pu1_ref[0..7] — confirm.    */
        UWORD8 *pu1_ref_tmp = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        uint8x8_t pu1_src_val, rev_res;
        uint64x1_t shift_res;

        for(col = nt; col > 0; col -= 4)
        {
            for(row = nt; row > 0; row -= 4)
            {
                /* Unrolling all cols & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */

                pu1_src_val = vld1_u8(pu1_ref_tmp);
                shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
                rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
                pu1_dst_tmp += dst_strd;

                /* Each next row drops the first sample: shift right one byte. */
                shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;

                shift_res = vshr_n_u64(shift_res, 8);
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
                pu1_dst_tmp += dst_strd;
            }
        }
    }

    /* rev_val_second / rev_val_first reverse the loaded values to obtain them in the right order. */
    /* shift_val shifts the reversed second vector to supply the byte spliced in by vext.          */
    /* Rows and columns are unrolled by 8 when the width is a multiple of 8.                       */

    else
    {
        UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
        UWORD8 *pu1_dst_tmp = pu1_dst;
        UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;

        uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
        uint64x1_t shift_val;

        two_nt = 2 * nt;
        pu1_ref_two_nt_minus2 += (two_nt);
        pu1_ref_two_nt_minus2 -= 8;

        for(col = nt; col > 0; col -= 8)
        {
            for(row = nt; row > 0; row -= 8)
            {
                /* Load 16 reference bytes (two d-registers), reversed so that  */
                /* the left references come out top-to-bottom.                  */
                pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_first = vrev64_u8(pu1_src_val2);

                pu1_ref_two_nt_minus2 -= 8;
                pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
                rev_val_second = vrev64_u8(pu1_src_val1);

                /* Row r is row r-1 advanced by one sample: vext pulls the next */
                /* byte in from the shifted second vector each iteration.       */
                vext_t = vext_u8(rev_val_first, rev_val_second, 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;

                shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
                vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
                vst1_u8(pu1_dst_tmp, vext_t);
                pu1_dst_tmp += dst_strd;
            }
            pu1_dst_tmp_plus8 += 8;
            pu1_dst_tmp = pu1_dst_tmp_plus8;
            pu1_ref_two_nt_minus2 += (nt - 8);
        }
    }
}
/* INTRA_PRED_LUMA_MODE2 */
/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for luma modes 18 and 34.
 *
 * @par Description:
 *  Intra prediction for mode 18 (45-degree diagonal) and mode 34 (45-degree
 *  north-east angle). Reference neighbouring samples pointed to by 'pu1_ref'
 *  are copied into the TU block pointed to by 'pu1_dst'; each successive
 *  destination row is the previous row shifted by one reference sample
 *  (left for mode 34, right for mode 18).
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference sample array
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination block
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer transform block size; the (nt & 7) path assumes a width that is
 *  a multiple of 4
 *
 * @param[in] mode
 *  integer intra prediction mode (18 or 34)
 *
 * @returns none
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 pred_angle = 32;
    WORD32 two_nt;
    UNUSED(src_strd);

    two_nt = 2 * nt;

    /* Widths that are a multiple of 8: process a full 8x8 tile per inner   */
    /* iteration. Two adjacent 8-byte reference vectors are loaded and the  */
    /* eight output rows are formed by vext splices at offsets 0..7.        */
    if(0 == (nt & 7))
    {
        UWORD8 *ref_ptr = pu1_ref;
        UWORD8 *dst_ptr = pu1_dst;
        UWORD8 *dst_col_base = pu1_dst;

        uint8x8_t ref_lo, ref_hi;
        uint8x8_t sh1, sh2, sh3, sh4, sh5, sh6, sh7;

        if(mode == 34)
        {
            /* Mode 34 walks the top references forward: row r+1 is row r     */
            /* advanced by one sample.                                        */
            ref_ptr += (two_nt + 2);

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* First and second 8-byte groups of references. */
                    ref_lo = vld1_u8(ref_ptr);
                    ref_ptr += 8;
                    ref_hi = vld1_u8(ref_ptr);

                    /* Precompute the seven shifted row vectors.               */
                    /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    sh1 = vext_u8(ref_lo, ref_hi, 1);
                    sh2 = vext_u8(ref_lo, ref_hi, 2);
                    sh3 = vext_u8(ref_lo, ref_hi, 3);
                    sh4 = vext_u8(ref_lo, ref_hi, 4);
                    sh5 = vext_u8(ref_lo, ref_hi, 5);
                    sh6 = vext_u8(ref_lo, ref_hi, 6);
                    sh7 = vext_u8(ref_lo, ref_hi, 7);

                    vst1_u8(dst_ptr, ref_lo);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh1);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh2);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh3);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh4);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh5);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh6);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh7);
                    dst_ptr += dst_strd;
                }

                /* Next 8-row band: restart at the next 8-column offset. */
                dst_col_base += 8;
                dst_ptr = dst_col_base;
                ref_ptr -= (nt - 8);
            }
        }
        else /* mode 18 */
        {
            /* Mode 18 walks the references backward: row r+1 is row r        */
            /* stepped back by one sample.                                    */
            ref_ptr += two_nt;

            for(row = nt; row > 0; row -= 8)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    /* Current group and the group just below it. */
                    ref_lo = vld1_u8(ref_ptr);
                    ref_ptr -= 8;
                    ref_hi = vld1_u8(ref_ptr);

                    /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    sh1 = vext_u8(ref_hi, ref_lo, 7);
                    sh2 = vext_u8(ref_hi, ref_lo, 6);
                    sh3 = vext_u8(ref_hi, ref_lo, 5);
                    sh4 = vext_u8(ref_hi, ref_lo, 4);
                    sh5 = vext_u8(ref_hi, ref_lo, 3);
                    sh6 = vext_u8(ref_hi, ref_lo, 2);
                    sh7 = vext_u8(ref_hi, ref_lo, 1);

                    vst1_u8(dst_ptr, ref_lo);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh1);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh2);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh3);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh4);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh5);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh6);
                    dst_ptr += dst_strd;
                    vst1_u8(dst_ptr, sh7);
                    dst_ptr += dst_strd;
                }
                dst_col_base += 8;
                dst_ptr = dst_col_base;
                ref_ptr += (nt + 8);
            }
        }
    }
    else /* width is a multiple of 4 only */
    {
        uint8x8_t row_even, row_odd;
        UWORD8 *dst_row;

        if(mode == 18)
            pred_angle = -32;
        else if(mode == 34)
            pred_angle = 32;

        /* Two rows per iteration; each row copies 4 reference bytes from    */
        /* the angle-derived offset.                                         */
        for(row = 0; row < nt; row += 2)
        {
            idx = ((row + 1) * pred_angle) >> 5;
            row_even = vld1_u8(pu1_ref + two_nt + idx + 1);

            idx = ((row + 2) * pred_angle) >> 5;
            row_odd = vld1_u8(pu1_ref + two_nt + idx + 1);

            for(col = nt; col > 0; col -= 4)
            {
                dst_row = pu1_dst;
                vst1_lane_u32((uint32_t *)dst_row, vreinterpret_u32_u8(row_even), 0);
                dst_row += dst_strd;
                vst1_lane_u32((uint32_t *)dst_row, vreinterpret_u32_u8(row_odd), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */
/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for luma modes 3 to 9.
 *
 * @par Description:
 *  Intra prediction for modes 3 to 9 (positive angle, horizontal modes) with
 *  reference neighbouring samples pointed to by 'pu1_ref', written to the TU
 *  block pointed to by 'pu1_dst'. Each destination sample is a 5-bit
 *  fractional blend of two adjacent references:
 *  (ref[i] * (32 - fract) + ref[i - 1] * fract + 16) >> 5
 *  (the vmull/vadd/vrshrn chain below).
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference sample array
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination block
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer transform block size; assumed 4, 8, 16 or 32 — the (nt & 7)
 *  path effectively handles nt == 4 (TODO confirm with callers)
 *
 * @param[in] mode
 *  integer intra prediction mode (3..9), indexes gai4_ihevc_ang_table
 *
 * @returns none
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    /* fract starts at the out-of-range sentinel 100 so the very first       */
    /* fract_prev < fract comparison is false and no pointer advance occurs. */
    WORD32 pos, fract = 100, fract_prev;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        WORD32 two_nt = 2 * nt;

        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra prediction angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;

        /* Outer loop walks destination columns; inner loop fills one column */
        /* (stored transposed: lane 7 lands on the lowest row).              */
        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;

            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            /* When the fractional part wraps around, the integer reference  */
            /* index advances by one.                                        */
            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                /* Two-tap blend with round-to-nearest narrowing (>> 5). */
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;

            }
            /* Next destination column; rewind the reference pointers and    */
            /* step them back one (column-to-column angular step).           */
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;

            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;

        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra prediction angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];


        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            /* Fill one 4-sample destination column (transposed store:       */
            /* lane 3 lands on the lowest row).                              */
            for(row = nt; row > 0; row -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);

            }
            /* Column-to-column step of the angular reference index. */
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

        }


    }

}
@param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    /* fract = 1000 is a sentinel above the 0..31 fractional range so that */
    /* the first iteration of the prediction loops below never takes the   */
    /* (fract_prev < fract) pointer-rewind branch.                         */
    WORD32 pos, fract = 1000, fract_prev;
    WORD32 ref_idx;

    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;

    /* Scratch buffer holding the re-ordered main reference samples. */
    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);

    inv_ang_sum = 128;
    two_nt = 2 * nt;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];

    pu1_ref_tmp1 += two_nt;

    /* ref_main points (nt - 1) bytes into ref_temp so that the negative  */
    /* indices written by the projection loop below stay inside ref_temp. */
    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        /* Horizontal modes use the left column as the main reference:    */
        /* copy it reversed (bottom-up -> increasing index), 8 at a time. */
        pu1_ref_tmp2 += (two_nt - 7);

        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_left_t = vld1_u8(pu1_ref_tmp2);

            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;

        }

    }
    else
    {
        /* nt not a multiple of 8: reverse 4 left samples via a lane load. */
        /* NOTE(review): this loop advances neither pu1_ref_tmp2 nor       */
        /* ref_main_tmp, so it copies only 4 samples per call -            */
        /* presumably only nt == 4 reaches this branch; confirm callers.   */
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);

            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);

        }

    }

    ref_main[nt] = pu1_ref[two_nt - nt];

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    /* Most negative main-reference index needed for this angle. */
    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /* reference samples refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    /* Second pointer trails one sample ahead for the two-tap filter. */
    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For the angles other then 45 degree, interpolation btw 2 neighboring */
        /* samples dependent on distance to obtain destination sample */
        for(col = 0; col < nt; col++)
        {

            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            /* Low 5 bits of pos = fractional sample position (1/32 units). */
            fract = pos & (31);

            /* Negative angle: when fract wraps upwards, the integer      */
            /* reference index has stepped back by one sample.            */
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                /* (32 - fract) * ref[i] + fract * ref[i + 1] */
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                /* Rounding shift by 5 completes the 1/32-precision blend. */
                shift_res = vrshrn_n_u16(add_res, 5);

                /* Horizontal mode: the 8 results belong to one destination */
                /* column, so store them lane by lane down the column.      */
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            /* Rewind the reference walk; step to the next column. */
            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        /* Same filtering for nt not a multiple of 8 (4-sample groups). */
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {

                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                /* Store the 4 results down one destination column. */
                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

            }

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 19 to mode 25
 *
 * @par Description:
 *    Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    /* fract = 1000 is a sentinel above the 0..31 fractional range so the  */
    /* first prediction-loop iteration never takes the rewind branch.      */
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    /* Scratch buffer holding the re-ordered main reference samples. */
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    /* NOTE(review): pu1_ref_tmp2 is adjusted below but never read in this */
    /* function.                                                           */
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This have to be removed during optimization*/
    pu1_ref_tmp1 += two_nt;


    /* ref_main points (nt - 1) bytes into ref_temp so that the negative  */
    /* indices written by the projection loop below stay inside ref_temp. */
    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        /* Vertical modes use the top row as the main reference: copy it  */
        /* forward, 8 samples at a time.                                  */
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;

        }

    }
    else
    {
        /* NOTE(review): this loop advances neither pu1_ref_tmp1 nor      */
        /* ref_main_tmp, so it copies only 4 samples per call -           */
        /* presumably only nt == 4 reaches this branch; confirm callers.  */
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 4)
        {

            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);

        }

    }

    ref_main[nt] = pu1_ref[two_nt + nt];

    /* For horizontal modes, (ref main = ref above) (ref side = ref left) */

    /* Most negative main-reference index needed for this angle. */
    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /* reference samples refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    /* Second pointer trails one sample ahead for the two-tap filter. */
    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For the angles other then 45 degree, interpolation btw 2 neighboring */
        /* samples dependent on distance to obtain destination sample */
        for(row = 0; row < nt; row++)
        {

            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            /* Low 5 bits of pos = fractional sample position (1/32 units). */
            fract = pos & (31);

            /* Negative angle: when fract wraps upwards, the integer      */
            /* reference index has stepped back by one sample.            */
            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                /* (32 - fract) * ref[i] + fract * ref[i + 1] */
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                /* Rounding shift by 5 completes the 1/32-precision blend. */
                shift_res = vrshrn_n_u16(add_res, 5);

                /* Vertical mode: the 8 results form one destination row. */
                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            /* Rewind the reference walk; step to the next row. */
            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        /* Same filtering for nt not a multiple of 8 (4-sample groups). */
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {

                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                /* 4 results form one destination row. */
                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;

            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }

    }

}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 27 to mode 33
 *
 * @par Description:
 *    Intraprediction
for mode 27 to 33 (positive angle, vertical mode ) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    /* fract = 0 so the first loop iteration never takes the              */
    /* (fract_prev > fract) pointer-advance branch.                       */
    WORD32 pos, fract = 0, fract_prev;

    WORD32 two_nt = 2 * nt;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        /* Positive-angle vertical mode: the main reference is the top    */
        /* row starting at pu1_ref[2*nt + 1]; the second pointer trails   */
        /* one sample ahead for the two-tap interpolation.                */
        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        pu1_ref_main_idx += (two_nt + 1);
        pu1_ref_main_idx_1 += (two_nt + 2);

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;

            pos = ((row + 1) * intra_pred_ang);
            /* Low 5 bits of pos = fractional sample position (1/32 units). */
            fract = pos & (31);

            /* Positive angle: when fract wraps downwards, the integer    */
            /* reference index has advanced by one sample.                */
            if(fract_prev > fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                /* (32 - fract) * ref[i] + fract * ref[i + 1] */
                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                /* Rounding shift by 5 completes the 1/32-precision blend. */
                shift_res = vrshrn_n_u16(add_res, 5);

                /* Vertical mode: the 8 results form one destination row. */
                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                pu1_ref_main_idx += 8;
                pu1_ref_main_idx_1 += 8;
            }

            /* Rewind the reference walk; step to the next row. */
            pu1_ref_main_idx -= nt;
            pu1_ref_main_idx_1 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }

    }
    else
    {
        /* Same filtering for nt not a multiple of 8 (4-sample groups).   */
        /* NOTE(review): the inner loop advances neither pu1_ref_tmp1 nor */
        /* pu1_ref_tmp2, so it handles 4 samples per row - presumably     */
        /* only nt == 4 reaches this branch; confirm against callers.     */
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;

        pu1_ref_tmp1 += (two_nt + 1);;
        pu1_ref_tmp2 += (two_nt + 2);;

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            /* Advance the reference pointers when the integer part of    */
            /* pos has grown by one (fract wrapped downwards).            */
            if(fract_prev > fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                /* 4 results form one destination row. */
                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;

            }

            pu1_dst_tmp1 += (dst_strd - nt);

        }


    }

}