/******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_intra_pred_filters_neon_intr.c
 *
 * @brief
 *  Contains function definitions for intra prediction interpolation filters
 *
 *
 * @author
 *  Yogeswaran RS
 *
 * @par List of Functions:
 *  - ihevc_intra_pred_luma_planar()
 *  - ihevc_intra_pred_luma_dc()
 *  - ihevc_intra_pred_luma_horz()
 *  - ihevc_intra_pred_luma_ver()
 *  - ihevc_intra_pred_luma_mode2()
 *  - ihevc_intra_pred_luma_mode_18_34()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64


/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/

/* Evaluates to 1 when bit x of y is set, 0 otherwise */
#define GET_BITS(y, x) (((y) & (1 << (x))) && (1 << (x)))

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for pu1_ref substitution
 *
 *
 * @par Description:
 *  Reference substitution process for samples unavailable for prediction.
 *  Refer to section 8.4.4.2.2 of the standard.
 *
 * @param[in] pu1_top_left
 *  UWORD8 pointer to the top-left
 *
 * @param[in] pu1_top
 *  UWORD8 pointer to the top
 *
 * @param[in] pu1_left
 *  UWORD8 pointer to the left
 *
 * @param[in] src_strd
 *  WORD32 source stride
 *
 * @param[in] nt
 *  WORD32 transform block size
 *
 * @param[in] nbr_flags
 *  WORD32 neighbor availability flags
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] dst_strd
 *  WORD32 destination stride
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
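/* Note on the neighbour availability word (illustrative, mirrors the
 * extraction done in the function below):
 *
 *     bit 16     bits 15-12   bits 11-8   bits 7-4   bits 3-0
 *     Top-Left   Top-Right    Top         Left       Bottom-Left
 *
 * For nt <= 8 only the least significant bit of each 4-bit field is read:
 *
 *     tp_left  = (nbr_flags & 0x10000) >> 16;
 *     tp_right = (nbr_flags & 0x1000)  >> 12;
 *     top      = (nbr_flags & 0x100)   >>  8;
 *     left     = (nbr_flags & 0x10)    >>  4;
 *     bot_left =  nbr_flags & 0x1;
 *
 * So, as an example, nbr_flags == 0x11111 marks every neighbour available,
 * while 0x11101 marks only the left column as missing, which is then
 * synthesised by the substitution logic below.
 */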
115 116 117 void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left, 118 UWORD8 *pu1_top, 119 UWORD8 *pu1_left, 120 WORD32 src_strd, 121 WORD32 nt, 122 WORD32 nbr_flags, 123 UWORD8 *pu1_dst, 124 WORD32 dst_strd) 125 { 126 UWORD8 pu1_ref; 127 WORD32 dc_val, i; 128 WORD32 total_samples = (4 * nt) + 1; 129 WORD32 two_nt = 2 * nt; 130 WORD32 three_nt = 3 * nt; 131 WORD32 get_bits; 132 WORD32 next; 133 WORD32 bot_left, left, top, tp_right, tp_left; 134 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag; 135 UNUSED(dst_strd); 136 dc_val = 1 << (BIT_DEPTH - 1); 137 138 /* Neighbor Flag Structure*/ 139 /* Top-Left | Top-Right | Top | Left | Bottom-Left 140 1 4 4 4 4 141 */ 142 143 /* If no neighbor flags are present, fill the neighbor samples with DC value */ 144 if(nbr_flags == 0) 145 { 146 for(i = 0; i < total_samples; i++) 147 { 148 pu1_dst[i] = dc_val; 149 } 150 } 151 else 152 { 153 /* Else fill the corresponding samples */ 154 pu1_dst[two_nt] = *pu1_top_left; 155 UWORD8 *pu1_dst_tmp2 = pu1_dst; 156 UWORD8 *pu1_top_tmp = pu1_top; 157 pu1_dst_tmp2 += two_nt + 1; 158 159 for(i = 0; i < two_nt; i++) 160 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd]; 161 162 uint8x8_t src; 163 for(i = two_nt; i > 0; i -= 8) 164 { 165 src = vld1_u8(pu1_top_tmp); 166 pu1_top_tmp += 8; 167 vst1_u8(pu1_dst_tmp2, src); 168 pu1_dst_tmp2 += 8; 169 } 170 171 if(nt <= 8) 172 { 173 /* 1 bit extraction for all the neighboring blocks */ 174 tp_left = (nbr_flags & 0x10000) >> 16; 175 bot_left = nbr_flags & 0x1; 176 left = (nbr_flags & 0x10) >> 4; 177 top = (nbr_flags & 0x100) >> 8; 178 tp_right = (nbr_flags & 0x1000) >> 12; 179 180 next = 1; 181 182 /* If bottom -left is not available, reverse substitution process*/ 183 if(bot_left == 0) 184 { 185 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right }; 186 187 /* Check for the 1st available sample from bottom-left*/ 188 while(!a_nbr_flag[next]) 189 next++; 190 191 /* If Left, top-left are available*/ 192 if(next <= 2) 193 { 194 idx = nt * next; 195 pu1_ref = pu1_dst[idx]; 196 for(i = 0; i < idx; i++) 197 pu1_dst[i] = pu1_ref; 198 } 199 else /* If top, top-right are available */ 200 { 201 /* Idx is changed to copy 1 pixel value for top-left ,if top-left is not available*/ 202 idx = (nt * (next - 1)) + 1; 203 pu1_ref = pu1_dst[idx]; 204 for(i = 0; i < idx; i++) 205 pu1_dst[i] = pu1_ref; 206 } 207 } 208 209 /* Forward Substitution Process */ 210 /* If left is Unavailable, copy the last bottom-left value */ 211 212 if(left == 0) 213 { 214 uint8x8_t dup_pu1_dst1; 215 UWORD8 *pu1_dst_const_nt = pu1_dst; 216 pu1_dst_const_nt += nt; 217 218 if(0 == (nt & 7)) 219 { 220 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); 221 for(i = nt; i > 0; i -= 8) 222 { 223 vst1_u8(pu1_dst_const_nt, dup_pu1_dst1); 224 pu1_dst_const_nt += 8; 225 226 } 227 } 228 else 229 { 230 //uint32x2_t dup_pu1_dst4; 231 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]); 232 //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]); 233 for(i = nt; i > 0; i -= 4) 234 { 235 vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0); 236 pu1_dst_const_nt += 4; 237 238 } 239 240 } 241 242 } 243 if(tp_left == 0) 244 pu1_dst[two_nt] = pu1_dst[two_nt - 1]; 245 if(top == 0) 246 { 247 248 if(0 == (nt & 7)) 249 { 250 uint8x8_t dup_pu1_dst2; 251 UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst; 252 pu1_dst_const_two_nt_1 += (two_nt + 1); 253 dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]); 254 for(i = nt; i > 0; i -= 8) 255 { 256 vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2); 257 pu1_dst_const_two_nt_1 += 8; 258 
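                    /* Scalar view of this fill (a sketch, not additional code
                     * in the decode path): with the top row unavailable, the
                     * nt samples above the block all take the top-left value,
                     * i.e.
                     *
                     *     for(i = 0; i < nt; i++)
                     *         pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                     *
                     * vdup_n_u8 broadcasts that byte and vst1_u8 writes 8
                     * copies per iteration; the else branch below does the
                     * same one pixel at a time when nt is not a multiple
                     * of 8. */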
259 } 260 } 261 else 262 { 263 for(i = 0; i < nt; i++) 264 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt]; 265 } 266 } 267 if(tp_right == 0) 268 { 269 uint8x8_t dup_pu1_dst3; 270 UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst; 271 pu1_dst_const_three_nt_1 += (three_nt + 1); 272 dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]); 273 if(0 == (nt & 7)) 274 { 275 for(i = nt; i > 0; i -= 8) 276 { 277 vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3); 278 pu1_dst_const_three_nt_1 += 8; 279 280 } 281 } 282 else 283 { 284 for(i = nt; i > 0; i -= 4) 285 { 286 vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0); 287 pu1_dst_const_three_nt_1 += 4; 288 } 289 290 } 291 292 } 293 } 294 if(nt == 16) 295 { 296 WORD32 nbr_flags_temp = 0; 297 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2) 298 + ((nbr_flags & 0x300) >> 4) 299 + ((nbr_flags & 0x3000) >> 6) 300 + ((nbr_flags & 0x10000) >> 8); 301 302 /* compute trailing zeors based on nbr_flag for substitution process of below left see section .*/ 303 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 304 { 305 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */ 306 307 if(nbr_id_from_bl == 64) 308 nbr_id_from_bl = 32; 309 310 if(nbr_id_from_bl == 32) 311 { 312 /* for top left : 1 pel per nbr bit */ 313 if(!((nbr_flags_temp >> 8) & 0x1)) 314 { 315 nbr_id_from_bl++; 316 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */ 317 } 318 } 319 /* Reverse Substitution Process*/ 320 if(nbr_id_from_bl) 321 { 322 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 323 pu1_ref = pu1_dst[nbr_id_from_bl]; 324 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 325 { 326 pu1_dst[i] = pu1_ref; 327 } 328 } 329 } 330 331 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 332 while(nbr_id_from_bl < ((T16_4NT) + 1)) 333 { 334 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 335 /* Devide by 8 to obtain the original index */ 336 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 337 338 /* The Top-left flag is at the last bit location of nbr_flags*/ 339 if(nbr_id_from_bl == (T16_4NT / 2)) 340 { 341 get_bits = GET_BITS(nbr_flags_temp, 8); 342 343 /* only pel substitution for TL */ 344 if(!get_bits) 345 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 346 } 347 else 348 { 349 get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag); 350 if(!get_bits) 351 { 352 /* 8 pel substitution (other than TL) */ 353 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 354 for(i = 0; i < 8; i++) 355 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 356 } 357 358 } 359 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 
1 : 8; 360 } 361 } 362 363 if(nt == 32) 364 { 365 /* compute trailing ones based on mbr_flag for substitution process of below left see section .*/ 366 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 367 { 368 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */ 369 370 if(nbr_id_from_bl == 64) 371 { 372 /* for top left : 1 pel per nbr bit */ 373 if(!((nbr_flags >> 16) & 0x1)) 374 { 375 /* top left not available */ 376 nbr_id_from_bl++; 377 /* top and top right; 8 pels per nbr bit */ 378 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8; 379 } 380 } 381 /* Reverse Substitution Process*/ 382 if(nbr_id_from_bl) 383 { 384 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 385 pu1_ref = pu1_dst[nbr_id_from_bl]; 386 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 387 pu1_dst[i] = pu1_ref; 388 } 389 } 390 391 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 392 while(nbr_id_from_bl < ((T32_4NT)+1)) 393 { 394 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 395 /* Devide by 8 to obtain the original index */ 396 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 397 398 /* The Top-left flag is at the last bit location of nbr_flags*/ 399 if(nbr_id_from_bl == (T32_4NT / 2)) 400 { 401 get_bits = GET_BITS(nbr_flags, 16); 402 /* only pel substitution for TL */ 403 if(!get_bits) 404 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 405 } 406 else 407 { 408 get_bits = GET_BITS(nbr_flags, frwd_nbr_flag); 409 if(!get_bits) 410 { 411 /* 8 pel substitution (other than TL) */ 412 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 413 for(i = 0; i < 8; i++) 414 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 415 } 416 417 } 418 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8; 419 } 420 } 421 422 } 423 424 } 425 426 /** 427 ******************************************************************************* 428 * 429 * @brief 430 * Intra prediction interpolation filter for ref_filtering 431 * 432 * 433 * @par Description: 434 * Reference DC filtering for neighboring samples dependent on TU size and 435 * mode Refer to section 8.4.4.2.3 in the standard 436 * 437 * @param[in] pu1_src 438 * UWORD8 pointer to the source 439 * 440 * @param[out] pu1_dst 441 * UWORD8 pointer to the destination 442 * 443 * @param[in] nt 444 * integer Transform Block size 445 * 446 * @param[in] mode 447 * integer intraprediction mode 448 * 449 * @returns 450 * 451 * @remarks 452 * None 453 * 454 ******************************************************************************* 455 */ 456 457 458 void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src, 459 WORD32 nt, 460 UWORD8 *pu1_dst, 461 WORD32 mode, 462 WORD32 strong_intra_smoothing_enable_flag) 463 { 464 WORD32 filter_flag; 465 WORD32 i = 0; 466 WORD32 four_nt = 4 * nt; 467 468 WORD32 src_4nt; 469 WORD32 src_0nt; 470 /* Naming has been made as per the functionlity it has, For eg. 
pu1_src_tmp_1 is denoting pu1_src + 1 */ 471 /* src_val_1 to load value from pointer pu1_src_tmp_1, add_res has the result of adding 2 values */ 472 UWORD8 *pu1_src_tmp_0 = pu1_src; 473 UWORD8 *pu1_src_tmp_1; 474 UWORD8 *pu1_src_tmp_2; 475 UWORD8 *pu1_dst_tmp_0 = pu1_dst; 476 UWORD8 *pu1_dst_tmp_1; 477 478 uint8x8_t src_val_0, src_val_2; 479 uint8x8_t src_val_1, shift_res; 480 uint8x8_t dup_const_2; 481 uint16x8_t mul_res, add_res; 482 WORD32 bi_linear_int_flag = 0; 483 WORD32 abs_cond_left_flag = 0; 484 WORD32 abs_cond_top_flag = 0; 485 WORD32 dc_val = 1 << (BIT_DEPTH - 5); 486 shift_res = vdup_n_u8(0); 487 488 filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)); 489 490 if(0 == filter_flag) 491 { 492 if(pu1_src == pu1_dst) 493 { 494 return; 495 } 496 else 497 { 498 for(i = four_nt; i > 0; i -= 8) 499 { 500 src_val_0 = vld1_u8(pu1_src_tmp_0); 501 pu1_src_tmp_0 += 8; 502 vst1_u8(pu1_dst_tmp_0, src_val_0); 503 pu1_dst_tmp_0 += 8; 504 } 505 pu1_dst[four_nt] = pu1_src[four_nt]; 506 } 507 } 508 509 else 510 { 511 /* If strong intra smoothin is enabled and transform size is 32 */ 512 if((1 == strong_intra_smoothing_enable_flag) && (32 == nt)) 513 { 514 /*Strong Intra Filtering*/ 515 abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt] 516 - (2 * pu1_src[3 * nt]))) < dc_val; 517 abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0] 518 - (2 * pu1_src[nt]))) < dc_val; 519 520 bi_linear_int_flag = ((1 == abs_cond_left_flag) 521 && (1 == abs_cond_top_flag)); 522 } 523 524 src_4nt = pu1_src[4 * nt]; 525 src_0nt = pu1_src[0]; 526 /* Strong filtering of reference samples */ 527 if(1 == bi_linear_int_flag) 528 { 529 WORD32 two_nt = four_nt >> 1; 530 531 WORD32 pu1_src_0_val = pu1_src[0]; 532 WORD32 pu1_src_2_nt_val = pu1_src[2 * nt]; 533 WORD32 pu1_src_4_nt_val = pu1_src[4 * nt]; 534 535 WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val; 536 uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val); 537 538 WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val; 539 uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val); 540 541 const UWORD8 *const_col_i; 542 uint8x8_t const_col_i_val; 543 uint16x8_t prod_val_1; 544 uint16x8_t prod_val_2; 545 uint16x8_t prod_val_3; 546 uint16x8_t prod_val_4; 547 uint8x8_t res_val_1; 548 uint8x8_t res_val_2; 549 uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val); 550 uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val); 551 uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val); 552 pu1_dst_tmp_0 = pu1_dst + 1; 553 pu1_dst_tmp_1 = pu1_dst + two_nt + 1; 554 555 const_col_i = gau1_ihevc_planar_factor + 1; 556 557 for(i = two_nt; i > 0; i -= 8) 558 { 559 const_col_i_val = vld1_u8(const_col_i); 560 const_col_i += 8; 561 562 prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t); 563 prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t); 564 565 res_val_1 = vrshrn_n_u16(prod_val_2, 6); 566 prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t); 567 568 vst1_u8(pu1_dst_tmp_0, res_val_1); 569 pu1_dst_tmp_0 += 8; 570 prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t); 571 572 res_val_2 = vrshrn_n_u16(prod_val_4, 6); 573 vst1_u8(pu1_dst_tmp_1, res_val_2); 574 pu1_dst_tmp_1 += 8; 575 } 576 pu1_dst[2 * nt] = pu1_src[2 * nt]; 577 } 578 else 579 { 580 pu1_src_tmp_1 = pu1_src + 1; 581 pu1_src_tmp_2 = pu1_src + 2; 582 pu1_dst_tmp_0 += 1; 583 584 dup_const_2 = vdup_n_u8(2); 585 586 /* Extremities Untouched*/ 
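            /* Scalar equivalent of the smoothing loop below (a sketch; the
             * NEON code produces the same values):
             *
             *     for(i = 1; i < four_nt; i++)
             *         pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i]
             *                       + pu1_src[i + 1] + 2) >> 2;
             *
             * vaddl_u8 forms pu1_src[i - 1] + pu1_src[i + 1], vmlal_u8 adds
             * 2 * pu1_src[i], and vrshrn_n_u16(.., 2) does the rounded shift
             * and narrows the result back to 8 bit. */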
587 pu1_dst[0] = pu1_src[0]; 588 589 /* To avoid the issue when the dest and src has the same pointer this load has been done 590 * outside and the 2nd consecutive load is done before the store of the 1st */ 591 592 /* Perform bilinear filtering of Reference Samples */ 593 for(i = (four_nt - 1); i > 0; i -= 8) 594 { 595 src_val_0 = vld1_u8(pu1_src_tmp_0); 596 pu1_src_tmp_0 += 8; 597 598 src_val_2 = vld1_u8(pu1_src_tmp_2); 599 pu1_src_tmp_2 += 8; 600 601 src_val_1 = vld1_u8(pu1_src_tmp_1); 602 pu1_src_tmp_1 += 8; 603 604 if(i < four_nt - 1) 605 { 606 vst1_u8(pu1_dst_tmp_0, shift_res); 607 pu1_dst_tmp_0 += 8; 608 } 609 610 add_res = vaddl_u8(src_val_0, src_val_2); 611 612 mul_res = vmlal_u8(add_res, src_val_1, dup_const_2); 613 shift_res = vrshrn_n_u16(mul_res, 2); 614 615 } 616 vst1_u8(pu1_dst_tmp_0, shift_res); 617 pu1_dst_tmp_0 += 8; 618 } 619 pu1_dst[4 * nt] = src_4nt; 620 pu1_dst[0] = src_0nt; 621 } 622 623 } 624 625 626 627 /** 628 ******************************************************************************* 629 * 630 * @brief 631 * Intra prediction interpolation filter for luma planar 632 * 633 * @par Description: 634 * Planar Intraprediction with reference neighboring samples location 635 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' 636 * 637 * @param[in] pu1_src 638 * UWORD8 pointer to the source 639 * 640 * @param[out] pu1_dst 641 * UWORD8 pointer to the destination 642 * 643 * @param[in] src_strd 644 * integer source stride 645 * 646 * @param[in] dst_strd 647 * integer destination stride 648 * 649 * @param[in] nt 650 * integer Transform Block size 651 * 652 * @param[in] wd 653 * integer width of the array 654 * 655 * @returns 656 * 657 * @remarks 658 * None 659 * 660 ******************************************************************************* 661 */ 662 663 void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref, 664 WORD32 src_strd, 665 UWORD8 *pu1_dst, 666 WORD32 dst_strd, 667 WORD32 nt, 668 WORD32 mode) 669 { 670 /* named it in the way (nt - 1 - col) --> const_nt_1_col(const denotes g_ihevc_planar_factor) */ 671 /* load const_nt_1_col values into a d register */ 672 /* named it in the way pu1_ref[nt - 1] --> pu1_ref_nt_1 */ 673 /* the value of pu1_ref_nt_1 is duplicated to d register hence pu1_ref_nt_1_dup */ 674 /* log2nt + 1 is taken care while assigning the values itself */ 675 /* In width multiple of 4 case the row also has been unrolled by 2 and store has been taken care*/ 676 677 WORD32 row, col = 0; 678 WORD32 log2nt_plus1 = 6; 679 WORD32 two_nt, three_nt; 680 UWORD8 *pu1_ref_two_nt_1; 681 UWORD8 *pu1_dst_tmp; 682 const UWORD8 *const_nt_1_col; 683 uint8x8_t const_nt_1_col_t; 684 const UWORD8 *const_col_1; 685 uint8x8_t const_col_1_t; 686 uint8_t const_nt_1_row; 687 uint8x8_t const_nt_1_row_dup; 688 uint8_t const_row_1; 689 uint8x8_t const_row_1_dup; 690 uint8_t const_nt = nt; 691 uint16x8_t const_nt_dup; 692 uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1]; 693 uint8x8_t pu1_ref_nt_1_dup; 694 uint8_t pu1_ref_two_nt_1_row; 695 uint8_t pu1_ref_three_nt_1; 696 uint8x8_t pu1_ref_two_nt_1_row_dup; 697 uint8x8_t pu1_ref_two_nt_1_t; 698 uint8x8_t pu1_ref_three_nt_1_dup; 699 uint16x8_t prod_t1; 700 uint16x8_t prod_t2; 701 uint16x8_t sto_res_tmp; 702 uint8x8_t sto_res; 703 int16x8_t log2nt_dup; 704 UNUSED(src_strd); 705 UNUSED(mode); 706 log2nt_plus1 = 32 - CLZ(nt); 707 two_nt = 2 * nt; 708 three_nt = 3 * nt; 709 /* loops have been unrolld considering the fact width is multiple of 8 */ 710 if(0 == (nt & 7)) 711 { 712 pu1_dst_tmp = pu1_dst; 713 const_nt_1_col 
= gau1_ihevc_planar_factor + nt - 8; 714 715 const_col_1 = gau1_ihevc_planar_factor + 1; 716 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1]; 717 718 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1); 719 const_nt_dup = vdupq_n_u16(const_nt); 720 721 log2nt_dup = vdupq_n_s16(log2nt_plus1); 722 log2nt_dup = vnegq_s16(log2nt_dup); 723 724 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1); 725 726 for(row = 0; row < nt; row++) 727 { 728 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row]; 729 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row); 730 731 const_nt_1_row = nt - 1 - row; 732 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row); 733 734 const_row_1 = row + 1; 735 const_row_1_dup = vdup_n_u8(const_row_1); 736 737 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8; 738 739 const_col_1 = gau1_ihevc_planar_factor + 1; 740 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1; 741 742 for(col = nt; col > 0; col -= 8) 743 { 744 const_nt_1_col_t = vld1_u8(const_nt_1_col); 745 const_nt_1_col -= 8; 746 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t); 747 748 const_col_1_t = vld1_u8(const_col_1); 749 const_col_1 += 8; 750 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup); 751 752 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1); 753 pu1_ref_two_nt_1 += 8; 754 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup); 755 756 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t); 757 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup); 758 prod_t1 = vaddq_u16(prod_t1, const_nt_dup); 759 prod_t1 = vaddq_u16(prod_t1, prod_t2); 760 761 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup)); 762 sto_res = vmovn_u16(sto_res_tmp); 763 vst1_u8(pu1_dst_tmp, sto_res); 764 pu1_dst_tmp += 8; 765 } 766 pu1_dst_tmp += dst_strd - nt; 767 } 768 } 769 /* loops have been unrolld considering the fact width is multiple of 4 */ 770 /* If column is multiple of 4 then height should be multiple of 2 */ 771 else 772 { 773 uint8x8_t const_row_1_dup1; 774 uint8x8_t pu1_ref_two_nt_1_t1; 775 uint8x8_t const_nt_1_col_t1; 776 uint8x8_t const_col_1_t1; 777 uint8x8_t pu1_ref_two_nt_1_row_dup1; 778 uint8x8_t const_nt_1_row_dup1; 779 780 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1]; 781 782 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1); 783 const_nt_dup = vdupq_n_u16(const_nt); 784 785 log2nt_dup = vdupq_n_s16(log2nt_plus1); 786 log2nt_dup = vnegq_s16(log2nt_dup); 787 788 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1); 789 790 for(row = 0; row < nt; row += 2) 791 { 792 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row]; 793 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row); 794 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row]; 795 pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row); 796 pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4); 797 798 const_nt_1_row = nt - 1 - row; 799 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row); 800 const_nt_1_row = nt - 2 - row; 801 const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row); 802 const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4); 803 804 const_row_1 = row + 1; 805 const_row_1_dup = vdup_n_u8(const_row_1); 806 const_row_1 = row + 2; 807 const_row_1_dup1 = vdup_n_u8(const_row_1); 808 const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4); 809 810 const_nt_1_col = gau1_ihevc_planar_factor + nt - 4; 811 812 const_col_1 = gau1_ihevc_planar_factor + 1; 813 814 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1; 815 816 for(col = nt; col > 0; col -= 4) 817 { 818 
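                /* What this loop computes, written out in scalar form (a
                 * sketch, with row/col as 0-based block coordinates and
                 * log2nt_plus1 == log2(nt) + 1):
                 *
                 *     pu1_dst[row * dst_strd + col] =
                 *         ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
                 *        + (col + 1)      * pu1_ref[three_nt + 1]
                 *        + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
                 *        + (row + 1)      * pu1_ref[nt - 1]
                 *        + nt) >> log2nt_plus1;
                 *
                 * The vmull/vmlal pairs below accumulate these four products,
                 * four columns (spanning two rows) per iteration in this
                 * path. */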
const_nt_1_col_t = vld1_u8(const_nt_1_col); 819 const_nt_1_col -= 4; 820 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t); 821 822 const_col_1_t = vld1_u8(const_col_1); 823 const_col_1 += 4; 824 const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32)); 825 826 pu1_dst_tmp = pu1_dst; 827 const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4); 828 829 const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32)); 830 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup); 831 832 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1); 833 pu1_ref_two_nt_1 += 4; 834 const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4); 835 836 pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32)); 837 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup); 838 839 pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4); 840 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup); 841 842 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t); 843 prod_t1 = vaddq_u16(prod_t1, const_nt_dup); 844 prod_t1 = vaddq_u16(prod_t1, prod_t2); 845 846 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup)); 847 sto_res = vmovn_u16(sto_res_tmp); 848 849 vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0); 850 pu1_dst_tmp += dst_strd; 851 852 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); 853 pu1_dst += 4; 854 } 855 pu1_dst += 2 * dst_strd - nt; 856 } 857 } 858 859 } 860 /* INTRA_PRED_LUMA_PLANAR */ 861 862 /** 863 ******************************************************************************* 864 * 865 * @brief 866 * Intra prediction interpolation filter for luma dc 867 * 868 * @par Description: 869 * Intraprediction for DC mode with reference neighboring samples location 870 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' 871 * 872 * @param[in] pu1_src 873 * UWORD8 pointer to the source 874 * 875 * @param[out] pu1_dst 876 * UWORD8 pointer to the destination 877 * 878 * @param[in] src_strd 879 * integer source stride 880 * 881 * @param[in] dst_strd 882 * integer destination stride 883 * 884 * @param[in] nt 885 * integer Transform Block size 886 * 887 * @param[in] wd 888 * integer width of the array 889 * 890 * @returns 891 * 892 * @remarks 893 * None 894 * 895 ******************************************************************************* 896 */ 897 898 void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref, 899 WORD32 src_strd, 900 UWORD8 *pu1_dst, 901 WORD32 dst_strd, 902 WORD32 nt, 903 WORD32 mode) 904 { 905 WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0; 906 WORD32 i = 0; 907 WORD32 row = 0, col = 0, col_count; 908 WORD32 log2nt_plus1 = 6; 909 WORD32 two_nt = 0; 910 uint16x8_t ref_load_q; 911 uint16x8_t three_dc_val_t; 912 uint8x8_t sto_res_tmp; 913 uint8x8_t sto_res_tmp1; 914 uint8x8_t sto_res_tmp2; 915 uint8x8_t sto_res_tmp3; 916 uint8x8_t sto_res_tmp4; 917 uint8x8_t dc_val_t; 918 919 UWORD8 *pu1_ref_tmp; 920 UWORD8 *pu1_ref_tmp1; 921 UWORD8 *pu1_dst_tmp; 922 UWORD8 *pu1_dst_tmp1; 923 UWORD8 *pu1_dst_tmp2; 924 UNUSED(src_strd); 925 UNUSED(mode); 926 927 /* log2nt + 1 is taken care while assigning the values itself. 
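       For example, for nt == 16: CLZ(nt) == 27 on a 32-bit WORD32, so
       log2nt_plus1 == 32 - 27 == 5 == log2(16) + 1, and the DC value is
       (sum of the 2 * nt reference samples + nt) >> 5.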
*/ 928 log2nt_plus1 = 32 - CLZ(nt); 929 930 /* loops have been unrolld considering the fact width is multiple of 8 */ 931 if(0 == (nt & 7)) 932 { 933 uint8x8_t ref_load1; 934 uint8x8_t ref_load2; 935 uint16x4_t acc_dc_pair1; 936 uint32x2_t acc_dc_pair2; 937 uint64x1_t acc_dc = vdup_n_u64(col); 938 939 two_nt = 2 * nt; 940 pu1_ref_tmp = pu1_ref + nt; 941 pu1_ref_tmp1 = pu1_ref + two_nt + 1; 942 943 for(i = two_nt; i > nt; i -= 8) 944 { 945 ref_load1 = vld1_u8(pu1_ref_tmp); 946 pu1_ref_tmp += 8; 947 acc_dc_pair1 = vpaddl_u8(ref_load1); 948 949 ref_load2 = vld1_u8(pu1_ref_tmp1); 950 pu1_ref_tmp1 += 8; 951 952 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1); 953 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2); 954 955 acc_dc_pair1 = vpaddl_u8(ref_load2); 956 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1); 957 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2); 958 } 959 960 dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1); 961 dc_val_t = vdup_n_u8(dc_val); 962 two_dc_val = 2 * dc_val; 963 three_dc_val = 3 * dc_val; 964 three_dc_val += 2; 965 966 three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val); 967 pu1_ref_tmp = pu1_ref + two_nt + 1 + 0; 968 pu1_dst_tmp = pu1_dst; 969 970 971 if(nt == 32) 972 { 973 for(row = 0; row < nt; row++) 974 { 975 for(col = nt; col > 0; col -= 8) 976 { 977 vst1_u8(pu1_dst_tmp, dc_val_t); 978 pu1_dst_tmp += 8; 979 } 980 pu1_dst_tmp += dst_strd - nt; 981 } 982 } 983 else 984 985 { 986 for(col = nt; col > 0; col -= 8) 987 { 988 ref_load1 = vld1_u8(pu1_ref_tmp); 989 pu1_ref_tmp += 8; 990 ref_load_q = vmovl_u8(ref_load1); 991 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t); 992 ref_load_q = vshrq_n_u16(ref_load_q, 2); 993 sto_res_tmp = vmovn_u16(ref_load_q); 994 vst1_u8(pu1_dst_tmp, sto_res_tmp); 995 pu1_dst_tmp += 8; 996 } 997 998 pu1_ref_tmp = pu1_ref + two_nt - 9; 999 pu1_dst_tmp = pu1_dst + dst_strd; 1000 col_count = nt - 8; 1001 1002 /* Except the first row the remaining rows are done here */ 1003 /* Both column and row has been unrolled by 8 */ 1004 /* Store has been taken care for the unrolling */ 1005 /* Except the 1st column of the remaining rows(other than 1st row), the values are */ 1006 /* constant hence it is extracted with an constant value and stored */ 1007 /* If the column is greater than 8, then the remaining values are constant which is */ 1008 /* taken care in the inner for loop */ 1009 1010 for(row = nt; row > 0; row -= 8) 1011 { 1012 pu1_dst_tmp1 = pu1_dst_tmp + 8; 1013 ref_load1 = vld1_u8(pu1_ref_tmp); 1014 pu1_ref_tmp -= 8; 1015 ref_load_q = vmovl_u8(ref_load1); 1016 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t); 1017 ref_load_q = vshrq_n_u16(ref_load_q, 2); 1018 sto_res_tmp = vmovn_u16(ref_load_q); 1019 1020 sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7); 1021 1022 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8)); 1023 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7); 1024 vst1_u8(pu1_dst_tmp, sto_res_tmp1); 1025 pu1_dst_tmp += dst_strd; 1026 1027 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16)); 1028 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7); 1029 vst1_u8(pu1_dst_tmp, sto_res_tmp2); 1030 pu1_dst_tmp += dst_strd; 1031 1032 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24)); 1033 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7); 1034 vst1_u8(pu1_dst_tmp, sto_res_tmp3); 1035 pu1_dst_tmp += dst_strd; 1036 1037 sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32)); 1038 sto_res_tmp1 = 
vext_u8(sto_res_tmp1, dc_val_t, 7); 1039 vst1_u8(pu1_dst_tmp, sto_res_tmp4); 1040 pu1_dst_tmp += dst_strd; 1041 1042 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40)); 1043 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7); 1044 vst1_u8(pu1_dst_tmp, sto_res_tmp1); 1045 pu1_dst_tmp += dst_strd; 1046 1047 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48)); 1048 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7); 1049 vst1_u8(pu1_dst_tmp, sto_res_tmp2); 1050 pu1_dst_tmp += dst_strd; 1051 1052 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56)); 1053 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7); 1054 vst1_u8(pu1_dst_tmp, sto_res_tmp3); 1055 pu1_dst_tmp += dst_strd; 1056 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ 1057 if(row != 8) 1058 vst1_u8(pu1_dst_tmp, sto_res_tmp4); 1059 pu1_dst_tmp += dst_strd; 1060 1061 for(col = col_count; col > 0; col -= 8) 1062 { 1063 pu1_dst_tmp2 = pu1_dst_tmp1; 1064 vst1_u8(pu1_dst_tmp1, dc_val_t); 1065 pu1_dst_tmp1 += dst_strd; 1066 vst1_u8(pu1_dst_tmp1, dc_val_t); 1067 pu1_dst_tmp1 += dst_strd; 1068 vst1_u8(pu1_dst_tmp1, dc_val_t); 1069 pu1_dst_tmp1 += dst_strd; 1070 vst1_u8(pu1_dst_tmp1, dc_val_t); 1071 pu1_dst_tmp1 += dst_strd; 1072 vst1_u8(pu1_dst_tmp1, dc_val_t); 1073 pu1_dst_tmp1 += dst_strd; 1074 vst1_u8(pu1_dst_tmp1, dc_val_t); 1075 pu1_dst_tmp1 += dst_strd; 1076 vst1_u8(pu1_dst_tmp1, dc_val_t); 1077 pu1_dst_tmp1 += dst_strd; 1078 1079 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ 1080 if(row != 8) 1081 vst1_u8(pu1_dst_tmp1, dc_val_t); 1082 pu1_dst_tmp1 = pu1_dst_tmp2 + 8; 1083 } 1084 } 1085 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2; 1086 } 1087 } 1088 /* loops have been unrolld considering the fact width is multiple of 4 */ 1089 else 1090 { 1091 WORD32 acc_dc; 1092 two_nt = 2 * nt; 1093 1094 acc_dc = 0; 1095 pu1_ref_tmp = pu1_ref + nt + 1; 1096 for(i = nt; i < two_nt; i++) 1097 { 1098 acc_dc += pu1_ref[i]; 1099 acc_dc += pu1_ref_tmp[i]; 1100 } 1101 dc_val = (acc_dc + nt) >> (log2nt_plus1); 1102 two_dc_val = 2 * dc_val; 1103 three_dc_val = 3 * dc_val; 1104 three_dc_val = three_dc_val + 2; 1105 dc_val_t = vdup_n_u8(dc_val); 1106 1107 if(nt == 32) 1108 { 1109 pu1_dst_tmp = pu1_dst; 1110 for(row = 0; row < nt; row++) 1111 { 1112 for(col = nt; col > 0; col -= 4) 1113 { 1114 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0); 1115 pu1_dst_tmp += 4; 1116 } 1117 pu1_dst_tmp += dst_strd - nt; 1118 } 1119 } 1120 else 1121 1122 { 1123 for(col = 1; col < nt; col++) 1124 { 1125 pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2; 1126 } 1127 1128 pu1_dst_tmp = pu1_dst + dst_strd + 0; 1129 /* Since first row is already updated before, loop count is nt-1 */ 1130 for(row = nt - 1; row > 0; row -= 1) 1131 { 1132 for(col = nt; col > 0; col -= 4) 1133 { 1134 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0); 1135 pu1_dst_tmp += 4; 1136 } 1137 pu1_dst_tmp += dst_strd - nt; 1138 } 1139 1140 for(row = 1; row < nt; row++) 1141 { 1142 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2; 1143 } 1144 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2; 1145 } 1146 } 1147 } 1148 /* INTRA_PRED_LUMA_DC */ 1149 1150 /** 1151 ******************************************************************************* 1152 * 1153 * @brief 1154 * Intra 
prediction interpolation filter for horizontal luma variable. 1155 * 1156 * @par Description: 1157 * Horizontal intraprediction with reference neighboring samples location 1158 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' 1159 * 1160 * @param[in] pu1_src 1161 * UWORD8 pointer to the source 1162 * 1163 * @param[out] pu1_dst 1164 * UWORD8 pointer to the destination 1165 * 1166 * @param[in] src_strd 1167 * integer source stride 1168 * 1169 * @param[in] dst_strd 1170 * integer destination stride 1171 * 1172 * @param[in] nt 1173 * integer Transform Block size 1174 * 1175 * @param[in] wd 1176 * integer width of the array 1177 * 1178 * @returns 1179 * 1180 * @remarks 1181 * None 1182 * 1183 ******************************************************************************* 1184 */ 1185 1186 void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref, 1187 WORD32 src_strd, 1188 UWORD8 *pu1_dst, 1189 WORD32 dst_strd, 1190 WORD32 nt, 1191 WORD32 mode) 1192 { 1193 1194 WORD32 row, col; 1195 WORD32 two_nt; 1196 UNUSED(src_strd); 1197 UNUSED(mode); 1198 1199 two_nt = 2 * nt; 1200 1201 1202 UWORD8 *pu1_dst_tmp = pu1_dst; 1203 UWORD32 pu1_val; 1204 uint8x8_t pu1_val_two_nt_1_row; 1205 if(nt == 32) 1206 { 1207 pu1_dst_tmp = pu1_dst; 1208 for(row = 0; row < nt; row++) 1209 { 1210 pu1_val = pu1_ref[two_nt - 1 - row]; 1211 pu1_val_two_nt_1_row = vdup_n_u8(pu1_val); 1212 for(col = nt; col > 0; col -= 8) 1213 { 1214 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row); 1215 pu1_dst_tmp += 8; 1216 } 1217 pu1_dst_tmp += dst_strd - nt; 1218 } 1219 } 1220 else 1221 1222 1223 /* row loop has been unrolled, hence had pu1_ref_val1 and pu1_ref_val2 variables*/ 1224 /* naming of variables made according to the operation(instructions) it performs*/ 1225 /* (eg. shift_val which contains the shifted value, */ 1226 /* add_sat which has add and saturated value) */ 1227 /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */ 1228 /* rows and columns are unrolled by 4, when the width is multiple of 4 */ 1229 { 1230 if(0 != (nt & 7)) /* cond for multiple of 4 */ 1231 { 1232 UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref; 1233 UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref; 1234 UWORD8 *pu1_dst_4 = pu1_dst; 1235 UWORD8 *pu1_dst_4_tmp = pu1_dst; 1236 1237 uint32x2_t pu1_ref_val1, pu1_ref_val2; 1238 uint8x8_t dup_sub, round_val, dup_val; 1239 uint16x8_t dup_add, sub_val; 1240 int16x8_t shift_val, add_sat; 1241 1242 pu1_ref_val1 = vdup_n_u32(0); 1243 pu1_ref_val2 = vdup_n_u32(0); 1244 1245 dup_sub = vdup_n_u8(pu1_ref[two_nt]); 1246 1247 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]); 1248 1249 pu1_ref_4_two_nt_plus1 += (two_nt + 1); 1250 1251 pu1_ref_4_two_nt_minus_nt += (two_nt - nt); 1252 1253 for(row = nt; row > 0; row -= 4) 1254 { 1255 for(col = nt; col > 0; col -= 4) 1256 { 1257 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0); 1258 sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub); 1259 shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); 1260 1261 add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add)); 1262 round_val = vqmovun_s16(add_sat); 1263 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0); 1264 pu1_dst_4 += dst_strd; 1265 1266 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0); 1267 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2); 1268 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); 1269 pu1_dst_4 += dst_strd; 1270 1271 dup_val 
= vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1); 1272 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); 1273 pu1_dst_4 += dst_strd; 1274 1275 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0); 1276 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0); 1277 pu1_dst_4 += dst_strd; 1278 1279 1280 } 1281 /* worst cases */ 1282 pu1_ref_4_two_nt_minus_nt += 3; 1283 pu1_ref_4_two_nt_plus1 += 4; 1284 pu1_dst_4 = (pu1_dst_4_tmp + 4); 1285 } 1286 1287 } 1288 1289 /* dup_1 - dup_8 are variables to load the duplicated values from the loaded source */ 1290 /* naming of variables made according to the operation(instructions) it performs */ 1291 /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */ 1292 /* rows and columns are unrolled by 8, when the width is multiple of 8 */ 1293 1294 else 1295 { 1296 UWORD8 *pu1_ref_tmp_1 = pu1_ref; 1297 UWORD8 *pu1_ref_tmp_2 = pu1_ref; 1298 1299 UWORD8 *pu1_dst_tmp_1 = pu1_dst; 1300 UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd; 1301 UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd; 1302 1303 uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res; 1304 uint16x8_t sub_res, dup_add; 1305 int16x8_t shift_res, add_res; 1306 1307 dup_sub = vdup_n_u8(pu1_ref[two_nt]); 1308 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]); 1309 1310 pu1_ref_tmp_1 += (two_nt + 1); 1311 pu1_ref_tmp_2 += (two_nt - 1); 1312 1313 for(col = nt; col > 0; col -= 8) 1314 { 1315 src_tmp = vld1_u8(pu1_ref_tmp_1); 1316 pu1_ref_tmp_1 += 8; 1317 1318 sub_res = vsubl_u8(src_tmp, dup_sub); 1319 shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1); 1320 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add)); 1321 round_val = vqmovun_s16(add_res); 1322 vst1_u8(pu1_dst_tmp_1, round_val); 1323 pu1_dst_tmp_1 += 8; 1324 } 1325 1326 for(row = nt; row > 0; row -= 8) 1327 { 1328 pu1_ref_tmp_2 -= 8; 1329 1330 src_tmp_1 = vld1_u8(pu1_ref_tmp_2); 1331 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */ 1332 1333 dup_1 = vdup_lane_u8(rev_res, 0); 1334 dup_2 = vdup_lane_u8(rev_res, 1); 1335 dup_3 = vdup_lane_u8(rev_res, 2); 1336 dup_4 = vdup_lane_u8(rev_res, 3); 1337 dup_5 = vdup_lane_u8(rev_res, 4); 1338 dup_6 = vdup_lane_u8(rev_res, 5); 1339 dup_7 = vdup_lane_u8(rev_res, 6); 1340 dup_8 = vdup_lane_u8(rev_res, 7); 1341 1342 for(col = nt; col > 0; col -= 8) 1343 { 1344 pu1_dst_tmp_2 = pu1_dst_tmp_3; 1345 1346 vst1_u8(pu1_dst_tmp_2, dup_1); 1347 pu1_dst_tmp_2 += dst_strd; 1348 1349 vst1_u8(pu1_dst_tmp_2, dup_2); 1350 pu1_dst_tmp_2 += dst_strd; 1351 1352 vst1_u8(pu1_dst_tmp_2, dup_3); 1353 pu1_dst_tmp_2 += dst_strd; 1354 1355 vst1_u8(pu1_dst_tmp_2, dup_4); 1356 pu1_dst_tmp_2 += dst_strd; 1357 1358 vst1_u8(pu1_dst_tmp_2, dup_5); 1359 pu1_dst_tmp_2 += dst_strd; 1360 1361 vst1_u8(pu1_dst_tmp_2, dup_6); 1362 pu1_dst_tmp_2 += dst_strd; 1363 1364 vst1_u8(pu1_dst_tmp_2, dup_7); 1365 pu1_dst_tmp_2 += dst_strd; 1366 1367 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */ 1368 if(row != 8) 1369 vst1_u8(pu1_dst_tmp_2, dup_8); 1370 pu1_dst_tmp_2 += dst_strd; 1371 1372 pu1_dst_tmp_3 += 8; 1373 } 1374 pu1_dst_tmp_2 -= (nt - 8); 1375 pu1_dst_tmp_3 = pu1_dst_tmp_2; 1376 } 1377 } 1378 } 1379 } 1380 /* INTRA_PRED_LUMA_HORZ */ 1381 1382 /** 1383 ******************************************************************************* 1384 * 1385 * @brief 1386 * Intra prediction interpolation filter for vertical luma variable. 
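 *  (Scalar view of the mapping implemented below, with two_nt = 2 * nt:
 *  pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; for nt < 32
 *  the first column is additionally filtered as
 *  pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1),
 *  saturated to 8 bit.)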
1387 * 1388 * @par Description: 1389 * Horizontal intraprediction with reference neighboring samples location 1390 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' 1391 * 1392 * @param[in] pu1_src 1393 * UWORD8 pointer to the source 1394 * 1395 * @param[out] pu1_dst 1396 * UWORD8 pointer to the destination 1397 * 1398 * @param[in] src_strd 1399 * integer source stride 1400 * 1401 * @param[in] dst_strd 1402 * integer destination stride 1403 * 1404 * @param[in] nt 1405 * integer Transform Block size 1406 * 1407 * @param[in] wd 1408 * integer width of the array 1409 * 1410 * @returns 1411 * 1412 * @remarks 1413 * None 1414 * 1415 ******************************************************************************* 1416 */ 1417 1418 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref, 1419 WORD32 src_strd, 1420 UWORD8 *pu1_dst, 1421 WORD32 dst_strd, 1422 WORD32 nt, 1423 WORD32 mode) 1424 { 1425 WORD32 row, col; 1426 WORD32 two_nt; 1427 UNUSED(src_strd); 1428 UNUSED(mode); 1429 1430 two_nt = 2 * nt; 1431 1432 UWORD8 *pu1_dst_tmp = pu1_dst; 1433 UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1; 1434 uint8x8_t pu1_val_two_nt_1_col; 1435 if(nt == 32) 1436 { 1437 pu1_dst_tmp = pu1_dst; 1438 for(row = 0; row < nt; row++) 1439 { 1440 for(col = nt; col > 0; col -= 8) 1441 { 1442 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1); 1443 pu1_ref_tmp_1 += 8; 1444 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col); 1445 pu1_dst_tmp += 8; 1446 } 1447 pu1_ref_tmp_1 -= nt; 1448 pu1_dst_tmp += dst_strd - nt; 1449 } 1450 } 1451 else 1452 1453 { 1454 /* naming of variables made according to the operation(instructions) it performs */ 1455 /* (eg. shift_val which contains the shifted value, */ 1456 /* add_sat which has add and saturated value) */ 1457 /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */ 1458 /* rows and columns are unrolled by 4, when the width is multiple of 4 */ 1459 1460 if(0 != (nt & 7)) 1461 { 1462 WORD32 cond_4 = 0; 1463 UWORD8 *pu1_ref_val1 = pu1_ref; 1464 UWORD8 *pu1_ref_val2 = pu1_ref; 1465 UWORD8 *pu1_ref_val3 = pu1_ref; 1466 1467 UWORD8 *pu1_dst_val1 = pu1_dst; 1468 UWORD8 *pu1_dst_val2 = pu1_dst; 1469 UWORD8 *pu1_dst_val3 = pu1_dst; 1470 1471 uint8x8_t dup_2_sub, round_val, vext_val; 1472 uint16x8_t dup_2_add; 1473 uint32x2_t src_val1, src_val2, src_val3; 1474 uint16x8_t sub_val; 1475 int16x8_t shift_val1, add_sat; 1476 uint64x1_t shift_val2; 1477 1478 src_val1 = vdup_n_u32(0); 1479 src_val2 = vdup_n_u32(0); 1480 src_val3 = vdup_n_u32(0); 1481 pu1_ref_val1 += (two_nt - nt); 1482 pu1_ref_val3 += (two_nt + 2); 1483 pu1_ref_val2 += (two_nt + 1); 1484 1485 dup_2_sub = vdup_n_u8(pu1_ref[two_nt]); 1486 dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]); 1487 1488 /* loops to store the first nt sets of values in the destination */ 1489 1490 for(row = nt; row > 0; row -= 4) 1491 { 1492 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4) 1493 { 1494 /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/ 1495 src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1); 1496 sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub); 1497 shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); 1498 add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add)); 1499 round_val = vqmovun_s16(add_sat); 1500 1501 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/ 1502 src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0); 1503 vext_val = 
vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7); 1504 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); 1505 pu1_dst_val1 += dst_strd; 1506 1507 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8); 1508 1509 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); 1510 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); 1511 pu1_dst_val1 += dst_strd; 1512 1513 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16); 1514 1515 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); 1516 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); 1517 pu1_dst_val1 += dst_strd; 1518 1519 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24); 1520 1521 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7); 1522 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0); 1523 pu1_dst_val1 += dst_strd; 1524 1525 pu1_ref_val1 -= 4; 1526 } 1527 1528 /* loop to store next sets of eight values in the destination */ 1529 1530 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4) 1531 { 1532 src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0); 1533 1534 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); 1535 pu1_dst_val2 += dst_strd; 1536 1537 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); 1538 pu1_dst_val2 += dst_strd; 1539 1540 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); 1541 pu1_dst_val2 += dst_strd; 1542 1543 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3)); 1544 pu1_dst_val2 += dst_strd; 1545 } 1546 pu1_ref_val2 += 4; 1547 pu1_dst_val3 += 4; 1548 pu1_dst_val2 = pu1_dst_val3; 1549 cond_4 = 1; 1550 } 1551 } 1552 1553 /* rows and columns are unrolled by 8, when the width is multiple of 8 */ 1554 else 1555 { 1556 WORD32 cond = 0, col_1; 1557 UWORD8 *pu1_dst_tmp_1 = pu1_dst; 1558 UWORD8 *pu1_dst_tmp_2 = pu1_dst; 1559 UWORD8 *pu1_dst_tmp_3 = pu1_dst; 1560 1561 UWORD8 *pu1_ref_tmp_1 = pu1_ref; 1562 UWORD8 *pu1_ref_tmp_2 = pu1_ref; 1563 UWORD8 *pu1_ref_tmp_3 = pu1_ref; 1564 1565 uint8x8_t pu1_src_tmp1; 1566 uint8x8_t pu1_src_tmp2; 1567 1568 uint8x8_t dup_sub; 1569 uint16x8_t dup_add; 1570 int16x8_t subsh_val; 1571 int16x8_t addsat_val; 1572 uint16x8_t sub_val; 1573 uint8x8_t round_val; 1574 uint8x8_t vext_t; 1575 uint64x1_t shift_64; 1576 1577 dup_sub = vdup_n_u8(pu1_ref[two_nt]); 1578 dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]); 1579 1580 pu1_ref_tmp_1 += (two_nt); 1581 pu1_ref_tmp_1 -= 8; 1582 pu1_ref_tmp_2 += (two_nt + 2); 1583 pu1_ref_tmp_3 += (two_nt + 1); 1584 1585 /* loops to store the first nt sets of values in the destination */ 1586 1587 for(row = nt; row > 0; row -= 8) 1588 { 1589 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8) 1590 { 1591 pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1); 1592 1593 sub_val = vsubl_u8(pu1_src_tmp1, dup_sub); 1594 subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1); 1595 addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add)); 1596 round_val = vqmovun_s16(addsat_val); 1597 1598 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/ 1599 1600 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2); 1601 vext_t = vext_u8(round_val, pu1_src_tmp2, 7); 1602 vst1_u8(pu1_dst_tmp_1, vext_t); 1603 pu1_dst_tmp_1 += dst_strd; 1604 1605 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8); 1606 1607 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1608 vst1_u8(pu1_dst_tmp_1, vext_t); 1609 pu1_dst_tmp_1 += 
dst_strd; 1610 1611 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16); 1612 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1613 vst1_u8(pu1_dst_tmp_1, vext_t); 1614 pu1_dst_tmp_1 += dst_strd; 1615 1616 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24); 1617 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1618 vst1_u8(pu1_dst_tmp_1, vext_t); 1619 pu1_dst_tmp_1 += dst_strd; 1620 1621 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32); 1622 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1623 vst1_u8(pu1_dst_tmp_1, vext_t); 1624 pu1_dst_tmp_1 += dst_strd; 1625 1626 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40); 1627 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1628 vst1_u8(pu1_dst_tmp_1, vext_t); 1629 pu1_dst_tmp_1 += dst_strd; 1630 1631 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48); 1632 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1633 vst1_u8(pu1_dst_tmp_1, vext_t); 1634 pu1_dst_tmp_1 += dst_strd; 1635 1636 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56); 1637 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7); 1638 vst1_u8(pu1_dst_tmp_1, vext_t); 1639 pu1_dst_tmp_1 += dst_strd; 1640 1641 pu1_ref_tmp_1 -= 8; 1642 } 1643 1644 /* loop to store next sets of eight values in the destination */ 1645 1646 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8) 1647 { 1648 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3); 1649 1650 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1651 pu1_dst_tmp_2 += dst_strd; 1652 1653 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1654 pu1_dst_tmp_2 += dst_strd; 1655 1656 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1657 pu1_dst_tmp_2 += dst_strd; 1658 1659 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1660 pu1_dst_tmp_2 += dst_strd; 1661 1662 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1663 pu1_dst_tmp_2 += dst_strd; 1664 1665 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1666 pu1_dst_tmp_2 += dst_strd; 1667 1668 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1669 pu1_dst_tmp_2 += dst_strd; 1670 1671 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2); 1672 pu1_dst_tmp_2 += dst_strd; 1673 } 1674 pu1_ref_tmp_3 += 8; 1675 pu1_dst_tmp_3 += 8; 1676 pu1_dst_tmp_2 = pu1_dst_tmp_3; 1677 cond = 1; 1678 } 1679 } 1680 } 1681 } 1682 /* INTRA_PRED_LUMA_VER */ 1683 1684 /** 1685 ******************************************************************************* 1686 * 1687 * @brief 1688 * Intra prediction interpolation filter for luma mode2. 
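 *  (Scalar view of the mode 2 mapping implemented below, with
 *  two_nt = 2 * nt: pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 2 - row - col],
 *  i.e. each successive output row reads the left reference one sample
 *  further down.)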
1689 * 1690 * @par Description: 1691 * Intraprediction for mode 2 (sw angle) with reference neighboring samples 1692 * location pointed by 'pu1_ref' to the TU block location pointed by 1693 * 'pu1_dst' 1694 * 1695 * @param[in] pu1_src 1696 * UWORD8 pointer to the source 1697 * 1698 * @param[out] pu1_dst 1699 * UWORD8 pointer to the destination 1700 * 1701 * @param[in] src_strd 1702 * integer source stride 1703 * 1704 * @param[in] dst_strd 1705 * integer destination stride 1706 * 1707 * @param[in] nt 1708 * integer Transform Block size 1709 * 1710 * @param[in] wd 1711 * integer width of the array 1712 * 1713 * @returns 1714 * 1715 * @remarks 1716 * None 1717 * 1718 ******************************************************************************* 1719 */ 1720 1721 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref, 1722 WORD32 src_strd, 1723 UWORD8 *pu1_dst, 1724 WORD32 dst_strd, 1725 WORD32 nt, 1726 WORD32 mode) 1727 { 1728 1729 WORD32 row, col; 1730 WORD32 two_nt; 1731 UNUSED(src_strd); 1732 UNUSED(mode); 1733 1734 /* rev_res naming has been made to have the reverse result value in it */ 1735 /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */ 1736 /* rows and columns are unrolled by 4, when the width is multiple of 4 */ 1737 1738 if(0 != (nt & 7)) 1739 { 1740 UWORD8 *pu1_ref_tmp = pu1_ref; 1741 UWORD8 *pu1_dst_tmp = pu1_dst; 1742 uint8x8_t pu1_src_val, rev_res; 1743 uint64x1_t shift_res; 1744 1745 for(col = nt; col > 0; col -= 4) 1746 { 1747 for(row = nt; row > 0; row -= 4) 1748 { 1749 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */ 1750 1751 pu1_src_val = vld1_u8(pu1_ref_tmp); 1752 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8); 1753 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res)); 1754 1755 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0); 1756 pu1_dst_tmp += dst_strd; 1757 1758 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8); 1759 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); 1760 pu1_dst_tmp += dst_strd; 1761 1762 shift_res = vshr_n_u64(shift_res, 8); 1763 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); 1764 pu1_dst_tmp += dst_strd; 1765 1766 shift_res = vshr_n_u64(shift_res, 8); 1767 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0); 1768 pu1_dst_tmp += dst_strd; 1769 } 1770 } 1771 } 1772 1773 /* rev_val_second, rev_val_first to reverse the loaded values in order to get the values in right order */ 1774 /* shift_64 to shift the reversed 2nd values to get the value what we need */ 1775 /* rows and columns are unrolled by 8, when the width is multiple of 8 */ 1776 1777 else 1778 { 1779 UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref; 1780 UWORD8 *pu1_dst_tmp = pu1_dst; 1781 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst; 1782 1783 uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first; 1784 uint64x1_t shift_val; 1785 1786 two_nt = 2 * nt; 1787 pu1_ref_two_nt_minus2 += (two_nt); 1788 pu1_ref_two_nt_minus2 -= 8; 1789 1790 for(col = nt; col > 0; col -= 8) 1791 { 1792 for(row = nt; row > 0; row -= 8) 1793 { 1794 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2); 1795 rev_val_first = vrev64_u8(pu1_src_val2); 1796 1797 pu1_ref_two_nt_minus2 -= 8; 1798 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2); 1799 rev_val_second = vrev64_u8(pu1_src_val1); 1800 1801 vext_t = vext_u8(rev_val_first, rev_val_second, 1); 1802 vst1_u8(pu1_dst_tmp, vext_t); 1803 pu1_dst_tmp += 
dst_strd; 1804 1805 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8); 1806 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1807 vst1_u8(pu1_dst_tmp, vext_t); 1808 pu1_dst_tmp += dst_strd; 1809 1810 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16); 1811 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1812 vst1_u8(pu1_dst_tmp, vext_t); 1813 pu1_dst_tmp += dst_strd; 1814 1815 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24); 1816 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1817 vst1_u8(pu1_dst_tmp, vext_t); 1818 pu1_dst_tmp += dst_strd; 1819 1820 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32); 1821 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1822 vst1_u8(pu1_dst_tmp, vext_t); 1823 pu1_dst_tmp += dst_strd; 1824 1825 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40); 1826 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1827 vst1_u8(pu1_dst_tmp, vext_t); 1828 pu1_dst_tmp += dst_strd; 1829 1830 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48); 1831 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1832 vst1_u8(pu1_dst_tmp, vext_t); 1833 pu1_dst_tmp += dst_strd; 1834 1835 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56); 1836 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1); 1837 vst1_u8(pu1_dst_tmp, vext_t); 1838 pu1_dst_tmp += dst_strd; 1839 } 1840 pu1_dst_tmp_plus8 += 8; 1841 pu1_dst_tmp = pu1_dst_tmp_plus8; 1842 pu1_ref_two_nt_minus2 += (nt - 8); 1843 } 1844 } 1845 } 1846 /* INTRA_PRED_LUMA_MODE2 */ 1847 1848 /** 1849 ******************************************************************************* 1850 * 1851 * @brief 1852 * Intra prediction interpolation filter for luma mode 18 & mode 34. 
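 *  (Scalar view of the mapping implemented below: with intraPredAngle = -32
 *  for mode 18 and +32 for mode 34, idx = ((row + 1) * intraPredAngle) >> 5
 *  and pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt + 1 + col + idx], so
 *  mode 34 reads one reference sample further right per row and mode 18 one
 *  sample further left.)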
1853 * 1854 * @par Description: 1855 * Intraprediction for mode 34 (ne angle) with reference neighboring 1856 * samples location pointed by 'pu1_ref' to the TU block location pointed by 1857 * 'pu1_dst' 1858 * 1859 * @param[in] pu1_src 1860 * UWORD8 pointer to the source 1861 * 1862 * @param[out] pu1_dst 1863 * UWORD8 pointer to the destination 1864 * 1865 * @param[in] src_strd 1866 * integer source stride 1867 * 1868 * @param[in] dst_strd 1869 * integer destination stride 1870 * 1871 * @param[in] nt 1872 * integer Transform Block size 1873 * 1874 * @param[in] wd 1875 * integer width of the array 1876 * 1877 * @returns 1878 * 1879 * @remarks 1880 * None 1881 * 1882 ******************************************************************************* 1883 */ 1884 1885 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref, 1886 WORD32 src_strd, 1887 UWORD8 *pu1_dst, 1888 WORD32 dst_strd, 1889 WORD32 nt, 1890 WORD32 mode) 1891 { 1892 1893 WORD32 row, col, idx; 1894 WORD32 intraPredAngle = 32; 1895 WORD32 two_nt; 1896 UNUSED(src_strd); 1897 two_nt = 2 * nt; 1898 1899 UWORD8 *pu1_ref_tmp = pu1_ref; 1900 UWORD8 *pu1_ref_tmp1 = pu1_ref; 1901 UWORD8 *pu1_dst_tmp = pu1_dst; 1902 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst; 1903 1904 uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7; 1905 1906 /* src_tmp_1st, src_tmp_2nd are named as to load the 1st eight and next 8 values from source(pu1_ref) */ 1907 /* vext1 - vext7 are named to do vext operation between 2 loaded values and to handle dual issue */ 1908 /* Loops are unrolled by 4 and 8 considering the fact the input width is either multiple of 4 or 8 */ 1909 /* rows and columns are unrolled by 8, when the width is multiple of 8 */ 1910 /* loops are maintained separately for mode18 and mode34 */ 1911 1912 /* cond to allow multiples of 8 */ 1913 if(0 == (nt & 7)) 1914 { 1915 if(mode == 34) 1916 { 1917 pu1_ref_tmp += (two_nt + 2); 1918 1919 for(row = nt; row > 0; row -= 8) 1920 { 1921 for(col = nt; col > 0; col -= 8) 1922 { 1923 /* Loading 1st eight values */ 1924 src_tmp_1st = vld1_u8(pu1_ref_tmp); 1925 pu1_ref_tmp += 8; 1926 1927 /* Loading next eight values */ 1928 src_tmp_2nd = vld1_u8(pu1_ref_tmp); 1929 1930 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */ 1931 vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1); 1932 vst1_u8(pu1_dst_tmp, src_tmp_1st); 1933 pu1_dst_tmp += dst_strd; 1934 1935 vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2); 1936 vst1_u8(pu1_dst_tmp, vext1); 1937 pu1_dst_tmp += dst_strd; 1938 1939 vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3); 1940 vst1_u8(pu1_dst_tmp, vext2); 1941 pu1_dst_tmp += dst_strd; 1942 1943 vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4); 1944 vst1_u8(pu1_dst_tmp, vext3); 1945 pu1_dst_tmp += dst_strd; 1946 1947 vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5); 1948 vst1_u8(pu1_dst_tmp, vext4); 1949 pu1_dst_tmp += dst_strd; 1950 1951 vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6); 1952 vst1_u8(pu1_dst_tmp, vext5); 1953 pu1_dst_tmp += dst_strd; 1954 1955 vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7); 1956 vst1_u8(pu1_dst_tmp, vext6); 1957 pu1_dst_tmp += dst_strd; 1958 1959 vst1_u8(pu1_dst_tmp, vext7); 1960 pu1_dst_tmp += dst_strd; 1961 } 1962 1963 pu1_dst_tmp_plus8 += 8; 1964 pu1_dst_tmp = pu1_dst_tmp_plus8; 1965 pu1_ref_tmp -= (nt - 8); 1966 } 1967 } 1968 else /* Loop for mode 18 */ 1969 { 1970 pu1_ref_tmp += (two_nt); 1971 1972 for(row = nt; row > 0; row -= 8) 1973 { 1974 for(col = nt; col > 0; col -= 8) 1975 { 1976 /* Loading 1st eight values */ 1977 src_tmp_1st = 
                    /* Loading 1st eight values */
                    src_tmp_1st = vld1_u8(pu1_ref_tmp);
                    pu1_ref_tmp -= 8;

                    /* Loading next eight values */
                    src_tmp_2nd = vld1_u8(pu1_ref_tmp);

                    /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
                    vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
                    vst1_u8(pu1_dst_tmp, src_tmp_1st);
                    pu1_dst_tmp += dst_strd;

                    vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
                    vst1_u8(pu1_dst_tmp, vext1);
                    pu1_dst_tmp += dst_strd;

                    vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
                    vst1_u8(pu1_dst_tmp, vext2);
                    pu1_dst_tmp += dst_strd;

                    vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
                    vst1_u8(pu1_dst_tmp, vext3);
                    pu1_dst_tmp += dst_strd;

                    vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
                    vst1_u8(pu1_dst_tmp, vext4);
                    pu1_dst_tmp += dst_strd;

                    vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
                    vst1_u8(pu1_dst_tmp, vext5);
                    pu1_dst_tmp += dst_strd;

                    vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
                    vst1_u8(pu1_dst_tmp, vext6);
                    pu1_dst_tmp += dst_strd;

                    vst1_u8(pu1_dst_tmp, vext7);
                    pu1_dst_tmp += dst_strd;
                }
                pu1_dst_tmp_plus8 += 8;
                pu1_dst_tmp = pu1_dst_tmp_plus8;
                pu1_ref_tmp += (nt + 8);
            }
        }
    }

    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */

    else /* Loop for multiples of 4 */
    {
        uint8x8_t src_val1;
        uint8x8_t src_val2;

        if(mode == 18)
            intraPredAngle = -32;
        else if(mode == 34)
            intraPredAngle = 32;

        for(row = 0; row < nt; row += 2)
        {
            /* Unrolling 2 rows */
            idx = ((row + 1) * intraPredAngle) >> 5;
            pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
            src_val1 = vld1_u8(pu1_ref_tmp);

            idx = ((row + 2) * intraPredAngle) >> 5;
            pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
            src_val2 = vld1_u8(pu1_ref_tmp1);

            /* Unrolling 4 columns */
            for(col = nt; col > 0; col -= 4)
            {
                pu1_dst_tmp = pu1_dst;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
                pu1_dst_tmp += dst_strd;
                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_MODE_18_34 */
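
/* Scalar sketch of the routine above (an illustrative assumption, not part of
 * the original implementation, and excluded from compilation): for the +/-32
 * angles the fractional part is always zero, so the kernel reduces to the
 * diagonal copy stated in the UNROLLED comments,
 *     pu1_dst[col + row * dst_strd] = pu1_ref[two_nt + col + idx + 1]. */
#if 0
static void ihevc_intra_pred_luma_mode_18_34_scalar_sketch(UWORD8 *pu1_ref,
                                                           UWORD8 *pu1_dst,
                                                           WORD32 dst_strd,
                                                           WORD32 nt,
                                                           WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 two_nt = 2 * nt;
    /* +32 corresponds to mode 34, -32 to mode 18 */
    WORD32 intra_pred_ang = (34 == mode) ? 32 : -32;

    for(row = 0; row < nt; row++)
    {
        idx = ((row + 1) * intra_pred_ang) >> 5;
        for(col = 0; col < nt; col++)
            pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
    }
}
#endif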

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode 3 to mode 9
*
* @par Description:
*    Intra prediction for modes 3 to 9 (positive angle, horizontal modes)
*    using the neighboring reference samples pointed to by 'pu1_ref', with the
*    prediction written to the TU block pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
                                                WORD32 src_strd,
                                                UWORD8 *pu1_dst,
                                                WORD32 dst_strd,
                                                WORD32 nt,
                                                WORD32 mode)
{

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 100, fract_prev; /* fract seeded above 31 so the pointers are not advanced for the first column */
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        WORD32 two_nt = 2 * nt;

        pu1_ref_main_idx += two_nt;
        pu1_ref_main_idx_1 += two_nt - 1;

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        pu1_ref_main_idx -= 8;
        pu1_ref_main_idx_1 -= 8;

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;

            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                pu1_ref_main_idx -= 8;
                pu1_ref_main_idx_1 -= 8;

            }
            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

            pu1_ref_main_idx += nt;
            pu1_ref_main_idx_1 += nt;

            pu1_ref_main_idx -= 1;
            pu1_ref_main_idx_1 -= 1;

        }
    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;

        pu1_ref_tmp1 += nt;
        pu1_ref_tmp2 += (nt - 1);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];


        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev < fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

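            /* Two-tap filtering below, in scalar terms:
             *     pred = ((32 - fract) * a + fract * b + 16) >> 5
             * where a and b are adjacent reference samples: vmull_u8 forms the
             * two widened products, vaddq_u16 sums them and vrshrn_n_u16(.., 5)
             * applies the +16 rounding together with the final shift. */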
            for(row = nt; row > 0; row -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);

            }
            pu1_ref_tmp1 -= 1;
            pu1_ref_tmp2 -= 1;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

        }

    }

}

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode 11 to mode 17
*
* @par Description:
*    Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
*    using the neighboring reference samples pointed to by 'pu1_ref', with the
*    prediction written to the TU block pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 pos, fract = 1000, fract_prev; /* fract seeded above 31 so the pointers are not moved for the first column */
    WORD32 ref_idx;

    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;
    UWORD8 *pu1_dst_tmp2 = pu1_dst;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_left_t;
    uint32x2_t ref_left_tmp;
    UNUSED(src_strd);
    ref_left_tmp = vdup_n_u32(0);

    inv_ang_sum = 128;
    two_nt = 2 * nt;

    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];

    pu1_ref_tmp1 += two_nt;

    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);

        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_left_t = vld1_u8(pu1_ref_tmp2);

            ref_left_t = vrev64_u8(ref_left_t);
            vst1_u8(ref_main_tmp, ref_left_t);
            ref_main_tmp += 8;
            pu1_ref_tmp2 -= 8;

        }

    }
    else
    {
        uint8x8_t rev_val;
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);

            rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
            vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);

        }

    }

    ref_main[nt] = pu1_ref[two_nt - nt];

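    /* ref_temp/ref_main now mirror the left reference top-to-bottom, i.e.
     * ref_main[k] = pu1_ref[two_nt - k]; the nt - 1 bytes kept below ref_main
     * leave room for the negative indices filled by the inverse-angle
     * projection below. */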
    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    ref_idx = (nt * intra_pred_ang) >> 5;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between two neighboring */
        /* samples, weighted by the distance, to obtain the destination sample */
        for(col = 0; col < nt; col++)
        {

            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(row = nt; row > 0; row -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
                pu1_dst_tmp1 += dst_strd;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(col = 0; col < nt; col++)
        {
            fract_prev = fract;
            pos = ((col + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(row = nt; row > 0; row -= 4)
            {

                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
                pu1_dst_tmp1 += dst_strd;

                vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
                pu1_dst_tmp1 += dst_strd;

            }

            pu1_dst_tmp2 += 1;
            pu1_dst_tmp1 = pu1_dst_tmp2;

        }

    }
}
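
/* The two horizontal kernels above (and the vertical ones below) all evaluate
 * the same two-tap filter with vmull_u8/vaddq_u16/vrshrn_n_u16. A hypothetical
 * scalar helper, shown only for illustration and excluded from compilation: */
#if 0
static UWORD8 ihevc_angular_two_tap_sketch(UWORD8 a, UWORD8 b, WORD32 fract)
{
    /* a and b are two adjacent reference samples; fract is in [0, 31] */
    return (UWORD8)(((32 - fract) * a + fract * b + 16) >> 5);
}
#endif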

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode 19 to mode 25
*
* @par Description:
*    Intra prediction for modes 19 to 25 (negative angle, vertical modes)
*    using the neighboring reference samples pointed to by 'pu1_ref', with the
*    prediction written to the TU block pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{

    WORD32 row, col, k;
    WORD32 two_nt, intra_pred_ang;
    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
    WORD32 ref_idx;
    UWORD8 *ref_main;
    UWORD8 *ref_main_tmp;
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];

    UWORD8 *pu1_ref_tmp1 = pu1_ref;
    UWORD8 *pu1_ref_tmp2 = pu1_ref;
    UWORD8 *pu1_dst_tmp1 = pu1_dst;

    uint16x8_t mul_res1, mul_res2, add_res;
    uint8x8_t dup_const_fract, dup_const_32_fract;
    uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
    uint8x8_t ref_above_t;
    uint32x2_t ref_above_tmp;
    UNUSED(src_strd);
    ref_above_tmp = vdup_n_u32(0);

    two_nt = 2 * nt;
    intra_pred_ang = gai4_ihevc_ang_table[mode];
    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This has to be removed during optimization */
    pu1_ref_tmp1 += two_nt;


    ref_main = ref_temp + (nt - 1);
    ref_main_tmp = ref_main;

    if(0 == (nt & 7))
    {
        pu1_ref_tmp2 += (two_nt - 7);
        for(k = nt - 1; k >= 0; k -= 8)
        {

            ref_above_t = vld1_u8(pu1_ref_tmp1);
            vst1_u8(ref_main_tmp, ref_above_t);
            ref_main_tmp += 8;
            pu1_ref_tmp1 += 8;

        }

    }
    else
    {
        pu1_ref_tmp2 += (two_nt - (nt - 1));

        for(k = nt - 1; k >= 0; k -= 4)
        {

            ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
            vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);

        }

    }

    ref_main[nt] = pu1_ref[two_nt + nt];

    /* For vertical modes, (ref main = ref above) (ref side = ref left) */

    ref_idx = (nt * intra_pred_ang) >> 5;
    inv_ang_sum = 128;

    /* SIMD optimization can be done using a look-up table for this loop */
    /* For negative angles, derive the main reference samples from the side */
    /* reference samples; refer to section 8.4.4.2.6 */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
    }

    UWORD8 *ref_main_tmp1 = ref_main;
    UWORD8 *ref_main_tmp2 = ref_main;

    ref_main_tmp2 += 1;

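    /* The vertical kernels below differ from the horizontal ones above only in
     * the store pattern: the eight (or four) filtered results of one iteration
     * lie along a destination row, so they are written with a single
     * vst1_u8/vst1_lane_u32 and the pointer then advances by dst_strd - nt per
     * row, instead of scattering individual lanes down a column. */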
    if(0 == (nt & 7))
    {
        /* For angles other than 45 degrees, interpolate between two neighboring */
        /* samples, weighted by the distance, to obtain the destination sample */
        for(row = 0; row < nt; row++)
        {

            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            // Do linear filtering
            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(ref_main_tmp1);

                ref_main_idx_1 = vld1_u8(ref_main_tmp2);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                ref_main_tmp1 += 8;
                ref_main_tmp2 += 8;
            }

            ref_main_tmp1 -= nt;
            ref_main_tmp2 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }
    }
    else
    {
        uint32x2_t ref_main_idx1, ref_main_idx2;

        ref_main_idx1 = vdup_n_u32(0);
        ref_main_idx2 = vdup_n_u32(0);

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev < fract)
            {
                ref_main_tmp1 -= 1;
                ref_main_tmp2 -= 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {

                ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
                ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;

            }
            pu1_dst_tmp1 += (dst_strd - nt);
        }

    }

}

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma mode 27 to mode 33
*
* @par Description:
*    Intra prediction for modes 27 to 33 (positive angle, vertical modes)
*    using the neighboring reference samples pointed to by 'pu1_ref', with the
*    prediction written to the TU block pointed to by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
                                                  WORD32 src_strd,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 mode)
{
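    /* Modes 27 to 33 are the positive vertical angles: the main reference is
     * the top row starting at pu1_ref + two_nt + 1 and no intermediate
     * ref_temp is needed. fract is recomputed per destination row, and the
     * reference base advances by one exactly when fract wraps around
     * (fract_prev > fract), i.e. when the integer part of
     * (row + 1) * intra_pred_ang / 32 steps. */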

    WORD32 row, col;
    WORD32 intra_pred_ang;
    WORD32 pos, fract = 0, fract_prev;

    WORD32 two_nt = 2 * nt;
    UNUSED(src_strd);
    if(0 == (nt & 7))
    {

        UWORD8 *pu1_ref_main_idx = pu1_ref;
        UWORD8 *pu1_ref_main_idx_1 = pu1_ref;

        UWORD8 *pu1_dst_tmp1 = pu1_dst;
        pu1_ref_main_idx += (two_nt + 1);
        pu1_ref_main_idx_1 += (two_nt + 2);

        uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
        uint8x8_t shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;

            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);

            if(fract_prev > fract)
            {
                pu1_ref_main_idx += 1;
                pu1_ref_main_idx_1 += 1;
            }

            dup_const_fract = vdup_n_u8((uint8_t)fract);
            dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 8)
            {
                ref_main_idx = vld1_u8(pu1_ref_main_idx);
                ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);

                mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
                mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_u8(pu1_dst_tmp1, shift_res);
                pu1_dst_tmp1 += 8;

                pu1_ref_main_idx += 8;
                pu1_ref_main_idx_1 += 8;
            }

            pu1_ref_main_idx -= nt;
            pu1_ref_main_idx_1 -= nt;

            pu1_dst_tmp1 += (dst_strd - nt);
        }

    }
    else
    {
        UWORD8 *pu1_ref_tmp1 = pu1_ref;
        UWORD8 *pu1_ref_tmp2 = pu1_ref;
        UWORD8 *pu1_dst_tmp1 = pu1_dst;

        pu1_ref_tmp1 += (two_nt + 1);
        pu1_ref_tmp2 += (two_nt + 2);

        uint8x8_t dup_fract, dup_32_fract, shift_res;
        uint16x8_t mul_res1, mul_res2, add_res;
        uint32x2_t pu1_ref_val1, pu1_ref_val2;

        pu1_ref_val1 = vdup_n_u32(0);
        pu1_ref_val2 = vdup_n_u32(0);

        /* Intra Pred Angle according to the mode */
        intra_pred_ang = gai4_ihevc_ang_table[mode];

        for(row = 0; row < nt; row++)
        {
            fract_prev = fract;
            pos = ((row + 1) * intra_pred_ang);
            fract = pos & (31);
            if(fract_prev > fract)
            {
                pu1_ref_tmp1 += 1;
                pu1_ref_tmp2 += 1;
            }
            dup_fract = vdup_n_u8((uint8_t)fract);
            dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));

            for(col = nt; col > 0; col -= 4)
            {
                pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
                pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);

                mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
                mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);

                add_res = vaddq_u16(mul_res1, mul_res2);

                shift_res = vrshrn_n_u16(add_res, 5);

                vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
                pu1_dst_tmp1 += 4;

            }

            pu1_dst_tmp1 += (dst_strd - nt);

        }

    }

}
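
/* Scalar sketch of the positive-angle vertical prediction above (an
 * illustrative assumption, not part of the original implementation and
 * excluded from compilation): */
#if 0
static void ihevc_intra_pred_luma_mode_27_to_33_scalar_sketch(UWORD8 *pu1_ref,
                                                              UWORD8 *pu1_dst,
                                                              WORD32 dst_strd,
                                                              WORD32 nt,
                                                              WORD32 mode)
{
    WORD32 row, col, idx, pos, fract;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

    for(row = 0; row < nt; row++)
    {
        pos = (row + 1) * intra_pred_ang;
        idx = pos >> 5;
        fract = pos & 31;

        /* Two-tap filter between adjacent top reference samples */
        for(col = 0; col < nt; col++)
        {
            pu1_dst[col + (row * dst_strd)] =
                (UWORD8)(((32 - fract) * pu1_ref[two_nt + col + idx + 1]
                          + fract * pu1_ref[two_nt + col + idx + 2] + 16) >> 5);
        }
    }
}
#endif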