@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains function definitions for intra prediction (planar mode).
@*  functions are coded in arm neon assembly (gnu assembler syntax)
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_planar_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intraprediction filter for planar input
@*
@* @par description:
@*    for every (row, col) of the nt x nt block the routine accumulates
@*        (nt-1-col) * src[2nt-1-row]
@*      + (col+1)    * src[3nt+1]
@*      + (nt-1-row) * src[2nt+1+col]
@*      + (row+1)    * src[nt-1]
@*      + nt
@*    and shifts the sum right by (32 - clz(nt)) bits, i.e. log2(nt)+1 for a
@*    power-of-two nt. nt == 4 is handled by a separate row-at-a-time loop;
@*    every other size is processed in 8x8 tiles by a software-pipelined
@*    kernel (presumably nt is one of 8/16/32 there, going by the label
@*    tf_sz_8_16_32 - confirm against the caller).
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (not read by this routine: r1 is reused as a counter)
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients (not read by this routine)
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering (not read by this routine)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                                  word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode
@   pi1_coeff
@
@ #104 = 40 bytes pushed by stmfd {r4-r12, r14} + 64 bytes pushed by
@ vpush {d8 - d15}
@
@**************register roles after the common setup**************************
@r0  => &src[2nt+1]            (reload value for r14)
@r1  => nt                     (counter, stepped by 8 / by 1 in the 4x4 loop)
@r2  => dst write pointer
@r4  => nt
@r6  => &src[2nt-1]            (moved to &src[2nt-8] for the 8x8 kernel)
@r11 => gau1_ihevc_planar_factor
@r12 => gau1_ihevc_planar_factor + 1   (col+1 values)
@r14 => &src[2nt+1+col]
@d0  => src[nt-1]   in every lane
@d1  => src[3nt+1]  in every lane
@d2  => nt          in every lane
@d5  => row+1       d6 => nt-1-row       d7 => 1 (row step)
@q7  => -(32 - clz(nt)) in every 16-bit lane (negative count => vshl shifts right)

.equ    nt_offset,      104

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ pc-relative offsets of the two coefficient tables. each word is added to pc
@ at the matching ulblN label; the "- 8" compensates for pc reading two
@ instructions ahead in arm state.
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc

    clz         r5, r4
    rsb         r5, r5, #32                 @32 - clz(nt)
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @only one byte is needed (vdup.8 uses the low byte);
                                            @a byte load avoids an unaligned word access and
                                            @does not depend on byte order
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @single byte, see above
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
@ 8x8-tile path. comments tagged (k) belong to row k of the current tile,
@ comments tagged (1n) prepare row 1 of the next tile.
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @nt * (nt/8); decremented by 8 per 8x8 tile
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
ulbl2:
    add         r5,r5,pc
    sub         r6, r6, #7                  @&src[2nt-8]: 8 bytes ending at src[2nt-1]
    mov         r8, r2
    lsl         r9, r3, #3                  @8*dst_strd
    rsb         r9, r9, #8                  @8 - 8*dst_strd (top of the tile to the right)
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1)
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8

    beq         epilog                      @single 8x8 tile: only rows (7),(8) left to store

    subs        r1, r1, #8                  @row counter
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1) values for the next tile

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) values for the next tile

    beq         epilog                      @flags are still those of "subs r1" above

kernel_plnr:

    cmp         r1, #0                      @ (cond loop)
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value) (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop); sets the flags used by the
                                            @conditional pointer updates further down

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    @ NOTE(review): the (1n) loads below also execute on the final iteration,
    @ where no further tile follows and their results are never used - confirm
    @ that the tables and the reference array are large enough for that.
    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @col counter

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

epilog:
    @ drain the pipeline: rows (7) and (8) of the last tile are still in q13/q12

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    @ every path into epilog arrives with Z set and nothing in epilog touches
    @ the flags, so this branch was always taken; it is made unconditional so
    @ the 8x8 path can never fall through into the 4x4 loop.
    b           end_loop

tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    mov         r10, #4                     @reduce inc to #4 for 4x4 (r10 is not read again on this path)
    ldrb        r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row);
                                            @byte load: vdup.8 below only uses the low byte
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1) * src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) * src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1) * src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) * src[2nt-1-row]
@   vadd.i16    q6, q6, q8                  @add (nt)
@   vshl.s16    q6, q6, q7                  @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding narrow: (sum + 4) >> 3, i.e. add nt and shift for nt = 4
    vst1.s32    {d12[0]}, [r2], r3          @store the 4 result bytes of this row

    vadd.s8     d5, d5, d7                  @row++      [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (pc <- saved lr: returns)