@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains function definitions for intra prediction planar filtering.
@*  functions are coded in arm neon assembly (gnu assembler syntax)
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_planar_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intraprediction filter for planar input
@*
@* @par description:
@*    for every (row, col) of the nt x nt block the routine stores
@*
@*      dst[row][col] = ( (row + 1)      * src[nt - 1]
@*                      + (col + 1)      * src[3nt + 1]
@*                      + (nt - 1 - row) * src[2nt + 1 + col]
@*                      + (nt - 1 - col) * src[2nt - 1 - row]
@*                      + nt ) >> (log2(nt) + 1)
@*
@*    nt == 4 is handled by a simple one-row-per-iteration loop; every other
@*    size goes through a software pipelined loop that produces 8x8 tiles
@*    (the labels name the sizes 8, 16 and 32 - nt is assumed to be one of
@*    those, this is not checked here).
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[in] src_strd
@*  integer source stride (not read by this routine)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering (not read by this routine)
@*
@* @param[in] pi1_coeff
@*  planar coefficients (not read by this routine; the coefficient tables
@*  gau1_ihevc_planar_factor and gau1_ihevc_planar_factor_1 are used instead)
@*
@* @returns
@*  none
@*
@* @remarks
@*  core registers r4-r12 and neon registers d8-d15 are saved on entry and
@*  restored on exit, as required by the aapcs.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                                  word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #nt_offset (after the register saves below)
@   nt
@   mode
@   pi1_coeff

@ offset of the first stack argument once the prologue has run:
@ 10 core registers (40 bytes) + d8-d15 (64 bytes)
.equ    nt_offset,      104

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ pc-relative offsets of the two tables; each is turned into an address by
@ the "add rX, rX, pc" that sits on the matching ulbl label (pc reads as
@ label + 8 in arm state, hence the "- 8")
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8-d15}                    @d8-d15 are callee-saved (aapcs) and are used below

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc

    clz         r5, r4
    rsb         r5, r5, #32                 @32 - clz(nt) = log2(nt) + 1 for power-of-two nt
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @only one byte is needed; avoids an unaligned word read
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @only one byte is needed; avoids an unaligned word read
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @nt * (nt / 8): decremented by 8 per 8x8 tile
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
ulbl2:
    add         r5,r5,pc
    sub         r6, r6, #7                  @&src[2nt-8] so that lane 7 of the load is src[2nt-1]
    mov         r8, r2
    lsl         r9, r3, #3                  @8*stride
    rsb         r9, r9, #8                  @8-8*stride
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1)
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8                  @one 8x8 tile done

    beq         epilog                      @single tile (nt == 8): only rows 7 and 8 are left to store

    subs        r1, r1, #8                  @row counter
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1) values for the next tile, read from the factor_1 table

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) values for the next tile

    beq         epilog                      @flags still come from "subs r1" (neon does not alter them)

kernel_plnr:

    cmp         r1, #0                      @ (cond loop)
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value) (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @col counter

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

epilog:

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    b           end_loop                    @8/16/32 path is complete; never fall into the 4x4 path

tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    mov         r10, #4                     @reduce inc to #4 for 4x4
    ldrb        r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row)
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
@   vadd.i16    q6, q6, q8                  @add (nt)
@   vshl.s16    q6, q6, q7                  @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding shift by 3: adds 4 (= nt) then >> (log2(4) + 1)
    vst1.s32    {d12[0]}, [r2], r3          @store the 4 pixels of this row

    vadd.s8     d5, d5, d7                  @row++      [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8-d15}                    @restore callee-saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)